// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/ext4/file.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/fs/minix/file.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  ext4 fs regular file handling primitives
 *
 *  64-bit file support on 64-bit platforms by Jakub Jelinek
 *	(jj@sunsite.ms.mff.cuni.cz)
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/mount.h>
#include <linux/path.h>
#include <linux/dax.h>
#include <linux/quotaops.h>
#include <linux/pagevec.h>
#include <linux/uio.h>
#include <linux/mman.h>
#include <linux/backing-dev.h>
#include "ext4.h"
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
#include "truncate.h"

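/*
 * Returns true if the given I/O request may be issued as direct I/O against
 * this inode: the request must be supported by fscrypt, and the inode must
 * not use fs-verity, data journalling or inline data.
 */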
static bool ext4_dio_supported(struct kiocb *iocb, struct iov_iter *iter)
{
	struct inode *inode = file_inode(iocb->ki_filp);

	if (!fscrypt_dio_supported(iocb, iter))
		return false;
	if (fsverity_active(inode))
		return false;
	if (ext4_should_journal_data(inode))
		return false;
	if (ext4_has_inline_data(inode))
		return false;
	return true;
}

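/*
 * Direct I/O read path: take the shared inode lock (or bail out with -EAGAIN
 * for IOCB_NOWAIT), and fall back to buffered I/O when direct I/O is not
 * supported for this inode.
 */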
static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	ssize_t ret;
	struct inode *inode = file_inode(iocb->ki_filp);

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!inode_trylock_shared(inode))
			return -EAGAIN;
	} else {
		inode_lock_shared(inode);
	}

	if (!ext4_dio_supported(iocb, to)) {
		inode_unlock_shared(inode);
		/*
		 * Fallback to buffered I/O if the operation being performed on
		 * the inode is not supported by direct I/O. The IOCB_DIRECT
		 * flag needs to be cleared here in order to ensure that the
		 * direct I/O path within generic_file_read_iter() is not
		 * taken.
		 */
		iocb->ki_flags &= ~IOCB_DIRECT;
		return generic_file_read_iter(iocb, to);
	}

	ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL,
			   is_sync_kiocb(iocb));
	inode_unlock_shared(inode);

	file_accessed(iocb->ki_filp);
	return ret;
}

#ifdef CONFIG_FS_DAX
static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!inode_trylock_shared(inode))
			return -EAGAIN;
	} else {
		inode_lock_shared(inode);
	}
	/*
	 * Recheck under inode lock - at this point we are sure it cannot
	 * change anymore
	 */
	if (!IS_DAX(inode)) {
		inode_unlock_shared(inode);
		/* Fallback to buffered IO in case we cannot support DAX */
		return generic_file_read_iter(iocb, to);
	}
	ret = dax_iomap_rw(iocb, to, &ext4_iomap_ops);
	inode_unlock_shared(inode);

	file_accessed(iocb->ki_filp);
	return ret;
}
#endif

static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = file_inode(iocb->ki_filp);

	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
		return -EIO;

	if (!iov_iter_count(to))
		return 0; /* skip atime */

#ifdef CONFIG_FS_DAX
	if (IS_DAX(inode))
		return ext4_dax_read_iter(iocb, to);
#endif
	if (iocb->ki_flags & IOCB_DIRECT)
		return ext4_dio_read_iter(iocb, to);

	return generic_file_read_iter(iocb, to);
}

/*
 * Called when an inode is released. Note that this is different
 * from ext4_file_open: open gets called at every open, but release
 * gets called only when /all/ the files are closed.
 */
static int ext4_release_file(struct inode *inode, struct file *filp)
{
	if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) {
		ext4_alloc_da_blocks(inode);
		ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
	}
	/* if we are the last writer on the inode, drop the block reservation */
	if ((filp->f_mode & FMODE_WRITE) &&
	    (atomic_read(&inode->i_writecount) == 1) &&
	    !EXT4_I(inode)->i_reserved_data_blocks) {
		down_write(&EXT4_I(inode)->i_data_sem);
		ext4_discard_preallocations(inode, 0);
		up_write(&EXT4_I(inode)->i_data_sem);
	}
	if (is_dx(inode) && filp->private_data)
		ext4_htree_free_dir_info(filp->private_data);

	return 0;
}

/*
 * This tests whether the IO in question is block-aligned or not.
 * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they
 * are converted to written only after the IO is complete. Until they are
 * mapped, these blocks appear as holes, so dio_zero_block() will assume that
 * it needs to zero out portions of the start and/or end block. If 2 AIO
 * threads are at work on the same unwritten block, they must be synchronized
 * or one thread will zero the other's data, causing corruption.
 */
static bool
ext4_unaligned_io(struct inode *inode, struct iov_iter *from, loff_t pos)
{
	struct super_block *sb = inode->i_sb;
	unsigned long blockmask = sb->s_blocksize - 1;

	if ((pos | iov_iter_alignment(from)) & blockmask)
		return true;

	return false;
}

static bool
ext4_extending_io(struct inode *inode, loff_t offset, size_t len)
{
	if (offset + len > i_size_read(inode) ||
	    offset + len > EXT4_I(inode)->i_disksize)
		return true;
	return false;
}

/* Is IO overwriting allocated and initialized blocks? */
static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
{
	struct ext4_map_blocks map;
	unsigned int blkbits = inode->i_blkbits;
	int err, blklen;

	if (pos + len > i_size_read(inode))
		return false;

	map.m_lblk = pos >> blkbits;
	map.m_len = EXT4_MAX_BLOCKS(len, pos, blkbits);
	blklen = map.m_len;

	err = ext4_map_blocks(NULL, inode, &map, 0);
	/*
	 * 'err==len' means that all of the blocks have been preallocated,
	 * regardless of whether they have been initialized or not. To exclude
	 * unwritten extents, we need to check m_flags.
	 */
	return err == blklen && (map.m_flags & EXT4_MAP_MAPPED);
}

static ssize_t ext4_generic_write_checks(struct kiocb *iocb,
					 struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	if (unlikely(IS_IMMUTABLE(inode)))
		return -EPERM;

	ret = generic_write_checks(iocb, from);
	if (ret <= 0)
		return ret;

	/*
	 * If we have encountered a bitmap-format file, the size limit
	 * is smaller than s_maxbytes, which is for extent-mapped files.
	 */
	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

		if (iocb->ki_pos >= sbi->s_bitmap_maxbytes)
			return -EFBIG;
		iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos);
	}

	return iov_iter_count(from);
}

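/*
 * ext4_generic_write_checks() plus file_modified(): update the file's
 * timestamps and strip setuid/setgid/caps privileges before writing.
 */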
static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
{
	ssize_t ret, count;

	count = ext4_generic_write_checks(iocb, from);
	if (count <= 0)
		return count;

	ret = file_modified(iocb->ki_filp);
	if (ret)
		return ret;
	return count;
}

static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
					struct iov_iter *from)
{
	ssize_t ret;
	struct inode *inode = file_inode(iocb->ki_filp);

	if (iocb->ki_flags & IOCB_NOWAIT)
		return -EOPNOTSUPP;

	ext4_fc_start_update(inode);
	inode_lock(inode);
	ret = ext4_write_checks(iocb, from);
	if (ret <= 0)
		goto out;

	current->backing_dev_info = inode_to_bdi(inode);
	ret = generic_perform_write(iocb->ki_filp, from, iocb->ki_pos);
	current->backing_dev_info = NULL;

out:
	inode_unlock(inode);
	ext4_fc_stop_update(inode);
	if (likely(ret > 0)) {
		iocb->ki_pos += ret;
		ret = generic_write_sync(iocb, ret);
	}

	return ret;
}

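/*
 * Finish off an extending direct or DAX write: update i_size/i_disksize in a
 * transaction, drop the inode from the orphan list once the extension is
 * safely recorded, and truncate any blocks that were allocated beyond what
 * was actually written (e.g. after a short or failed write).
 */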
static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
					   ssize_t written, size_t count)
{
	handle_t *handle;
	bool truncate = false;
	u8 blkbits = inode->i_blkbits;
	ext4_lblk_t written_blk, end_blk;
	int ret;

	/*
	 * Note that EXT4_I(inode)->i_disksize can get extended up to
	 * inode->i_size while the I/O was running due to writeback of delalloc
	 * blocks. But, the code in ext4_iomap_alloc() is careful to use
	 * zeroed/unwritten extents if this is possible; thus we won't leave
	 * uninitialized blocks in a file even if we didn't succeed in writing
	 * as much as we intended.
	 */
	WARN_ON_ONCE(i_size_read(inode) < EXT4_I(inode)->i_disksize);
	if (offset + count <= EXT4_I(inode)->i_disksize) {
		/*
		 * We need to ensure that the inode is removed from the orphan
		 * list if it has been added prematurely, due to writeback of
		 * delalloc blocks.
		 */
		if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) {
			handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);

			if (IS_ERR(handle)) {
				ext4_orphan_del(NULL, inode);
				return PTR_ERR(handle);
			}

			ext4_orphan_del(handle, inode);
			ext4_journal_stop(handle);
		}

		return written;
	}

	if (written < 0)
		goto truncate;

	handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
	if (IS_ERR(handle)) {
		written = PTR_ERR(handle);
		goto truncate;
	}

	if (ext4_update_inode_size(inode, offset + written)) {
		ret = ext4_mark_inode_dirty(handle, inode);
		if (unlikely(ret)) {
			written = ret;
			ext4_journal_stop(handle);
			goto truncate;
		}
	}

	/*
	 * We may need to truncate allocated but not written blocks beyond EOF.
	 */
	written_blk = ALIGN(offset + written, 1 << blkbits);
	end_blk = ALIGN(offset + count, 1 << blkbits);
	if (written_blk < end_blk && ext4_can_truncate(inode))
		truncate = true;

	/*
	 * Remove the inode from the orphan list if it has been extended and
	 * everything went OK.
	 */
	if (!truncate && inode->i_nlink)
		ext4_orphan_del(handle, inode);
	ext4_journal_stop(handle);

	if (truncate) {
truncate:
		ext4_truncate_failed_write(inode);
		/*
		 * If the truncate operation failed early, then the inode may
		 * still be on the orphan list. In that case, we need to try
		 * to remove the inode from the in-memory linked list.
		 */
		if (inode->i_nlink)
			ext4_orphan_del(NULL, inode);
	}

	return written;
}

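/*
 * iomap direct I/O completion handler: convert unwritten extents covering the
 * written range and, for extending writes, bump the in-memory i_size before
 * the page cache is invalidated.
 */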
static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
				 int error, unsigned int flags)
{
	loff_t pos = iocb->ki_pos;
	struct inode *inode = file_inode(iocb->ki_filp);

	if (error)
		return error;

	if (size && flags & IOMAP_DIO_UNWRITTEN) {
		error = ext4_convert_unwritten_extents(NULL, inode, pos, size);
		if (error < 0)
			return error;
	}
	/*
	 * If we are extending the file, we have to update i_size here before
	 * page cache gets invalidated in iomap_dio_rw(). Otherwise racing
	 * buffered reads could zero out too much from page cache pages. Update
	 * of on-disk size will happen later in ext4_dio_write_iter() where
	 * we have enough information to also perform orphan list handling etc.
	 * Note that we perform all extending writes synchronously under
	 * i_rwsem held exclusively so i_size update is safe here in that case.
	 * If the write was not extending, we cannot see pos > i_size here
	 * because operations reducing i_size like truncate wait for all
	 * outstanding DIO before updating i_size.
	 */
	pos += size;
	if (pos > i_size_read(inode))
		i_size_write(inode, pos);

	return 0;
}

static const struct iomap_dio_ops ext4_dio_write_ops = {
	.end_io = ext4_dio_write_end_io,
};

/*
 * The intention here is to start with the shared lock acquired, then see if
 * any condition requires an exclusive inode lock. If so, we restart the
 * whole operation by releasing the shared lock and acquiring the exclusive
 * lock.
 *
 * - For unaligned IO we never take the shared lock, as it may cause data
 *   corruption when two unaligned IOs try to modify the same block, e.g.
 *   while zeroing.
 *
 * - For extending writes we don't take the shared lock either, since updating
 *   the inode's i_disksize and/or orphan handling requires the exclusive lock.
 *
 * - Shared locking is therefore mostly used for overwrites; otherwise we
 *   switch to the exclusive i_rwsem lock.
 */
static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
				     bool *ilock_shared, bool *extend)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	loff_t offset;
	size_t count;
	ssize_t ret;

restart:
	ret = ext4_generic_write_checks(iocb, from);
	if (ret <= 0)
		goto out;

	offset = iocb->ki_pos;
	count = ret;
	if (ext4_extending_io(inode, offset, count))
		*extend = true;
	/*
	 * Determine whether the IO operation will overwrite allocated
	 * and initialized blocks.
	 * We need exclusive i_rwsem for changing security info
	 * in file_modified().
	 */
	if (*ilock_shared && (!IS_NOSEC(inode) || *extend ||
	     !ext4_overwrite_io(inode, offset, count))) {
		if (iocb->ki_flags & IOCB_NOWAIT) {
			ret = -EAGAIN;
			goto out;
		}
		inode_unlock_shared(inode);
		*ilock_shared = false;
		inode_lock(inode);
		goto restart;
	}

	ret = file_modified(file);
	if (ret < 0)
		goto out;

	return count;
out:
	if (*ilock_shared)
		inode_unlock_shared(inode);
	else
		inode_unlock(inode);
	return ret;
}

static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	ssize_t ret;
	handle_t *handle;
	struct inode *inode = file_inode(iocb->ki_filp);
	loff_t offset = iocb->ki_pos;
	size_t count = iov_iter_count(from);
	const struct iomap_ops *iomap_ops = &ext4_iomap_ops;
	bool extend = false, unaligned_io = false;
	bool ilock_shared = true;

	/*
	 * We initially start with the shared inode lock unless it is
	 * unaligned IO, which needs the exclusive lock anyway.
	 */
	if (ext4_unaligned_io(inode, from, offset)) {
		unaligned_io = true;
		ilock_shared = false;
	}
	/*
	 * Quick check here without any i_rwsem lock to see if it is extending
	 * IO. A more reliable check is done in ext4_dio_write_checks() with
	 * proper locking in place.
	 */
	if (offset + count > i_size_read(inode))
		ilock_shared = false;

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (ilock_shared) {
			if (!inode_trylock_shared(inode))
				return -EAGAIN;
		} else {
			if (!inode_trylock(inode))
				return -EAGAIN;
		}
	} else {
		if (ilock_shared)
			inode_lock_shared(inode);
		else
			inode_lock(inode);
	}

	/* Fallback to buffered I/O if the inode does not support direct I/O. */
	if (!ext4_dio_supported(iocb, from)) {
		if (ilock_shared)
			inode_unlock_shared(inode);
		else
			inode_unlock(inode);
		return ext4_buffered_write_iter(iocb, from);
	}

	ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend);
	if (ret <= 0)
		return ret;

	/* if we're going to block and IOCB_NOWAIT is set, return -EAGAIN */
	if ((iocb->ki_flags & IOCB_NOWAIT) && (unaligned_io || extend)) {
		ret = -EAGAIN;
		goto out;
	}
	/*
	 * Make sure inline data cannot be created anymore since we are going
	 * to allocate blocks for DIO. We know the inode does not have any
	 * inline data now because ext4_dio_supported() checked for that.
	 */
	ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);

	offset = iocb->ki_pos;
	count = ret;

	/*
	 * Unaligned direct IO writes must be serialized with each other, as
	 * zeroing of partial blocks by two competing unaligned IOs can result
	 * in data corruption.
	 *
	 * So we make sure that no unaligned IO is in flight. For IOs where we
	 * need not wait (like unaligned non-AIO DIO), the inode_dio_wait()
	 * below may become a no-op, since we start with the exclusive lock.
	 */
	if (unaligned_io)
		inode_dio_wait(inode);

	if (extend) {
		handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
		if (IS_ERR(handle)) {
			ret = PTR_ERR(handle);
			goto out;
		}

		ext4_fc_start_update(inode);
		ret = ext4_orphan_add(handle, inode);
		ext4_fc_stop_update(inode);
		if (ret) {
			ext4_journal_stop(handle);
			goto out;
		}

		ext4_journal_stop(handle);
	}

	if (ilock_shared)
		iomap_ops = &ext4_iomap_overwrite_ops;
	ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
			   is_sync_kiocb(iocb) || unaligned_io || extend);
	if (ret == -ENOTBLK)
		ret = 0;

	if (extend)
		ret = ext4_handle_inode_extension(inode, offset, ret, count);

out:
	if (ilock_shared)
		inode_unlock_shared(inode);
	else
		inode_unlock(inode);

	if (ret >= 0 && iov_iter_count(from)) {
		ssize_t err;
		loff_t endbyte;

		offset = iocb->ki_pos;
		err = ext4_buffered_write_iter(iocb, from);
		if (err < 0)
			return err;

		/*
		 * We need to ensure that the pages within the page cache for
		 * the range covered by this I/O are written to disk and
		 * invalidated. This is an attempt to preserve the expected
		 * direct I/O semantics in the case where we fall back to
		 * buffered I/O to complete the I/O request.
		 */
		ret += err;
		endbyte = offset + err - 1;
		err = filemap_write_and_wait_range(iocb->ki_filp->f_mapping,
						   offset, endbyte);
		if (!err)
			invalidate_mapping_pages(iocb->ki_filp->f_mapping,
						 offset >> PAGE_SHIFT,
						 endbyte >> PAGE_SHIFT);
	}

	return ret;
}

#ifdef CONFIG_FS_DAX
static ssize_t
ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	ssize_t ret;
	size_t count;
	loff_t offset;
	handle_t *handle;
	bool extend = false;
	struct inode *inode = file_inode(iocb->ki_filp);

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!inode_trylock(inode))
			return -EAGAIN;
	} else {
		inode_lock(inode);
	}

	ret = ext4_write_checks(iocb, from);
	if (ret <= 0)
		goto out;

	offset = iocb->ki_pos;
	count = iov_iter_count(from);

	if (offset + count > EXT4_I(inode)->i_disksize) {
		handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
		if (IS_ERR(handle)) {
			ret = PTR_ERR(handle);
			goto out;
		}

		ret = ext4_orphan_add(handle, inode);
		if (ret) {
			ext4_journal_stop(handle);
			goto out;
		}

		extend = true;
		ext4_journal_stop(handle);
	}

	ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);

	if (extend)
		ret = ext4_handle_inode_extension(inode, offset, ret, count);
out:
	inode_unlock(inode);
	if (ret > 0)
		ret = generic_write_sync(iocb, ret);
	return ret;
}
#endif

static ssize_t
ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);

	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
		return -EIO;

#ifdef CONFIG_FS_DAX
	if (IS_DAX(inode))
		return ext4_dax_write_iter(iocb, from);
#endif
	if (iocb->ki_flags & IOCB_DIRECT)
		return ext4_dio_write_iter(iocb, from);
	else
		return ext4_buffered_write_iter(iocb, from);
}

#ifdef CONFIG_FS_DAX
static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf,
		enum page_entry_size pe_size)
{
	int error = 0;
	vm_fault_t result;
	int retries = 0;
	handle_t *handle = NULL;
	struct inode *inode = file_inode(vmf->vma->vm_file);
	struct super_block *sb = inode->i_sb;

	/*
	 * We have to distinguish real writes from writes which will result in a
	 * COW page; COW writes should *not* poke the journal (the file will not
	 * be changed). Doing so would cause unintended failures when mounted
	 * read-only.
	 *
	 * We check for VM_SHARED rather than vmf->cow_page since the latter is
	 * unset for pe_size != PE_SIZE_PTE (i.e. only in do_cow_fault); for
	 * other sizes, dax_iomap_fault will handle splitting / fallback so that
	 * we eventually come back with a COW page.
	 */
	bool write = (vmf->flags & FAULT_FLAG_WRITE) &&
		(vmf->vma->vm_flags & VM_SHARED);
	pfn_t pfn;

	if (write) {
		sb_start_pagefault(sb);
		file_update_time(vmf->vma->vm_file);
		down_read(&EXT4_I(inode)->i_mmap_sem);
retry:
		handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
					       EXT4_DATA_TRANS_BLOCKS(sb));
		if (IS_ERR(handle)) {
			up_read(&EXT4_I(inode)->i_mmap_sem);
			sb_end_pagefault(sb);
			return VM_FAULT_SIGBUS;
		}
	} else {
		down_read(&EXT4_I(inode)->i_mmap_sem);
	}
	result = dax_iomap_fault(vmf, pe_size, &pfn, &error, &ext4_iomap_ops);
	if (write) {
		ext4_journal_stop(handle);

		if ((result & VM_FAULT_ERROR) && error == -ENOSPC &&
		    ext4_should_retry_alloc(sb, &retries))
			goto retry;
		/* Handling synchronous page fault? */
		if (result & VM_FAULT_NEEDDSYNC)
			result = dax_finish_sync_fault(vmf, pe_size, pfn);
		up_read(&EXT4_I(inode)->i_mmap_sem);
		sb_end_pagefault(sb);
	} else {
		up_read(&EXT4_I(inode)->i_mmap_sem);
	}

	return result;
}

static vm_fault_t ext4_dax_fault(struct vm_fault *vmf)
{
	return ext4_dax_huge_fault(vmf, PE_SIZE_PTE);
}

static const struct vm_operations_struct ext4_dax_vm_ops = {
	.fault		= ext4_dax_fault,
	.huge_fault	= ext4_dax_huge_fault,
	.page_mkwrite	= ext4_dax_fault,
	.pfn_mkwrite	= ext4_dax_fault,
};
#else
#define ext4_dax_vm_ops	ext4_file_vm_ops
#endif

static const struct vm_operations_struct ext4_file_vm_ops = {
	.fault		= ext4_filemap_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= ext4_page_mkwrite,
#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
	.allow_speculation = filemap_allow_speculation,
#endif
};

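/*
 * mmap entry point: reject synchronous (MAP_SYNC) mappings that the
 * underlying device cannot support and select the DAX or regular
 * page-cache vm_ops.
 */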
static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct inode *inode = file->f_mapping->host;
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	struct dax_device *dax_dev = sbi->s_daxdev;

	if (unlikely(ext4_forced_shutdown(sbi)))
		return -EIO;

	/*
	 * We don't support synchronous mappings for non-DAX files and
	 * for DAX files if the underlying dax_device is not synchronous.
	 */
	if (!daxdev_mapping_supported(vma, dax_dev))
		return -EOPNOTSUPP;

	file_accessed(file);
	if (IS_DAX(file_inode(file))) {
		vma->vm_ops = &ext4_dax_vm_ops;
		vma->vm_flags |= VM_HUGEPAGE;
	} else {
		vma->vm_ops = &ext4_file_vm_ops;
	}
	return 0;
}

static int ext4_sample_last_mounted(struct super_block *sb,
				    struct vfsmount *mnt)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct path path;
	char buf[64], *cp;
	handle_t *handle;
	int err;

	if (likely(ext4_test_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED)))
		return 0;

	if (sb_rdonly(sb) || !sb_start_intwrite_trylock(sb))
		return 0;

	ext4_set_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED);
	/*
	 * Sample where the filesystem has been mounted and
	 * store it in the superblock for sysadmin convenience
	 * when trying to sort through large numbers of block
	 * devices or filesystem images.
	 */
	memset(buf, 0, sizeof(buf));
	path.mnt = mnt;
	path.dentry = mnt->mnt_root;
	cp = d_path(&path, buf, sizeof(buf));
	err = 0;
	if (IS_ERR(cp))
		goto out;

	handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
	err = PTR_ERR(handle);
	if (IS_ERR(handle))
		goto out;
	BUFFER_TRACE(sbi->s_sbh, "get_write_access");
	err = ext4_journal_get_write_access(handle, sbi->s_sbh);
	if (err)
		goto out_journal;
	strncpy(sbi->s_es->s_last_mounted, cp,
		sizeof(sbi->s_es->s_last_mounted));
	ext4_handle_dirty_super(handle, sb);
out_journal:
	ext4_journal_stop(handle);
out:
	sb_end_intwrite(sb);
	return err;
}

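/*
 * Called on every open of a regular file: record the mount point in the
 * superblock once per mount, run the fscrypt and fsverity open hooks, attach
 * the jbd2 inode for writable opens, and perform quota file-open handling.
 */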
static int ext4_file_open(struct inode *inode, struct file *filp)
{
	int ret;

	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
		return -EIO;

	ret = ext4_sample_last_mounted(inode->i_sb, filp->f_path.mnt);
	if (ret)
		return ret;

	ret = fscrypt_file_open(inode, filp);
	if (ret)
		return ret;

	ret = fsverity_file_open(inode, filp);
	if (ret)
		return ret;

	/*
	 * Set up the jbd2_inode if we are opening the inode for
	 * writing and the journal is present
	 */
	if (filp->f_mode & FMODE_WRITE) {
		ret = ext4_inode_attach_jinode(inode);
		if (ret < 0)
			return ret;
	}

	filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
	return dquot_file_open(inode, filp);
}

/*
 * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values
 * by calling generic_file_llseek_size() with the appropriate maxbytes
 * value for each.
 */
loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file->f_mapping->host;
	loff_t maxbytes;

	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
		maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
	else
		maxbytes = inode->i_sb->s_maxbytes;

	switch (whence) {
	default:
		return generic_file_llseek_size(file, offset, whence,
						maxbytes, i_size_read(inode));
	case SEEK_HOLE:
		inode_lock_shared(inode);
		offset = iomap_seek_hole(inode, offset,
					 &ext4_iomap_report_ops);
		inode_unlock_shared(inode);
		break;
	case SEEK_DATA:
		inode_lock_shared(inode);
		offset = iomap_seek_data(inode, offset,
					 &ext4_iomap_report_ops);
		inode_unlock_shared(inode);
		break;
	}

	if (offset < 0)
		return offset;
	return vfs_setpos(file, offset, maxbytes);
}

const struct file_operations ext4_file_operations = {
	.llseek		= ext4_llseek,
	.read_iter	= ext4_file_read_iter,
	.write_iter	= ext4_file_write_iter,
	.iopoll		= iomap_dio_iopoll,
	.unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= ext4_compat_ioctl,
#endif
	.mmap		= ext4_file_mmap,
	.mmap_supported_flags = MAP_SYNC,
	.open		= ext4_file_open,
	.release	= ext4_release_file,
	.fsync		= ext4_sync_file,
	.get_unmapped_area = thp_get_unmapped_area,
	.splice_read	= generic_file_splice_read,
	.splice_write	= iter_file_splice_write,
	.fallocate	= ext4_fallocate,
};

const struct inode_operations ext4_file_inode_operations = {
	.setattr	= ext4_setattr,
	.getattr	= ext4_file_getattr,
	.listxattr	= ext4_listxattr,
	.get_acl	= ext4_get_acl,
	.set_acl	= ext4_set_acl,
	.fiemap		= ext4_fiemap,
};