// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/ext4/file.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/fs/minix/file.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  ext4 fs regular file handling primitives
 *
 *  64-bit file support on 64-bit platforms by Jakub Jelinek
 *	(jj@sunsite.ms.mff.cuni.cz)
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/mount.h>
#include <linux/path.h>
#include <linux/dax.h>
#include <linux/quotaops.h>
#include <linux/pagevec.h>
#include <linux/uio.h>
#include <linux/mman.h>
#include <linux/backing-dev.h>
#include "ext4.h"
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
#include "truncate.h"

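/*
 * Returns true if the given I/O request can be served by direct I/O on this
 * inode: fscrypt must allow DIO, and verity, data journalling and inline data
 * all rule it out.
 */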
static bool ext4_dio_supported(struct kiocb *iocb, struct iov_iter *iter)
{
	struct inode *inode = file_inode(iocb->ki_filp);

	if (!fscrypt_dio_supported(iocb, iter))
		return false;
	if (fsverity_active(inode))
		return false;
	if (ext4_should_journal_data(inode))
		return false;
	if (ext4_has_inline_data(inode))
		return false;
	return true;
}

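/*
 * Direct I/O read path. Takes i_rwsem shared (trylock for IOCB_NOWAIT) and
 * falls back to buffered reads when direct I/O is not supported on the inode.
 */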
static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	ssize_t ret;
	struct inode *inode = file_inode(iocb->ki_filp);

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!inode_trylock_shared(inode))
			return -EAGAIN;
	} else {
		inode_lock_shared(inode);
	}

	if (!ext4_dio_supported(iocb, to)) {
		inode_unlock_shared(inode);
		/*
		 * Fall back to buffered I/O if the operation being performed
		 * on the inode is not supported by direct I/O. The IOCB_DIRECT
		 * flag needs to be cleared here in order to ensure that the
		 * direct I/O path within generic_file_read_iter() is not
		 * taken.
		 */
		iocb->ki_flags &= ~IOCB_DIRECT;
		return generic_file_read_iter(iocb, to);
	}

	ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL,
			   is_sync_kiocb(iocb));
	inode_unlock_shared(inode);

	file_accessed(iocb->ki_filp);
	return ret;
}

#ifdef CONFIG_FS_DAX
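/*
 * DAX read path: reads directly from persistent memory via dax_iomap_rw()
 * under the shared i_rwsem, rechecking IS_DAX() once the lock is held.
 */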
static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!inode_trylock_shared(inode))
			return -EAGAIN;
	} else {
		inode_lock_shared(inode);
	}
	/*
	 * Recheck under inode lock - at this point we are sure it cannot
	 * change anymore
	 */
	if (!IS_DAX(inode)) {
		inode_unlock_shared(inode);
		/* Fall back to buffered I/O in case we cannot support DAX */
		return generic_file_read_iter(iocb, to);
	}
	ret = dax_iomap_rw(iocb, to, &ext4_iomap_ops);
	inode_unlock_shared(inode);

	file_accessed(iocb->ki_filp);
	return ret;
}
#endif

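/*
 * Top-level ->read_iter() handler: dispatches to the DAX, direct I/O or
 * buffered read path depending on the inode and iocb flags.
 */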
static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = file_inode(iocb->ki_filp);

	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
		return -EIO;

	if (!iov_iter_count(to))
		return 0; /* skip atime */

#ifdef CONFIG_FS_DAX
	if (IS_DAX(inode))
		return ext4_dax_read_iter(iocb, to);
#endif
	if (iocb->ki_flags & IOCB_DIRECT)
		return ext4_dio_read_iter(iocb, to);

	return generic_file_read_iter(iocb, to);
}

/*
 * Called when an inode is released. Note that this is different
 * from ext4_file_open: open gets called at every open, but release
 * gets called only when /all/ the files are closed.
 */
static int ext4_release_file(struct inode *inode, struct file *filp)
{
	if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) {
		ext4_alloc_da_blocks(inode);
		ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
	}
	/* if we are the last writer on the inode, drop the block reservation */
	if ((filp->f_mode & FMODE_WRITE) &&
			(atomic_read(&inode->i_writecount) == 1) &&
			!EXT4_I(inode)->i_reserved_data_blocks) {
		down_write(&EXT4_I(inode)->i_data_sem);
		ext4_discard_preallocations(inode, 0);
		up_write(&EXT4_I(inode)->i_data_sem);
	}
	if (is_dx(inode) && filp->private_data)
		ext4_htree_free_dir_info(filp->private_data);

	return 0;
}

/*
 * This tests whether the IO in question is block-aligned or not.
 * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they
 * are converted to written only after the IO is complete.  Until they are
 * mapped, these blocks appear as holes, so dio_zero_block() will assume that
 * it needs to zero out portions of the start and/or end block.  If 2 AIO
 * threads are at work on the same unwritten block, they must be synchronized
 * or one thread will zero the other's data, causing corruption.
 */
static bool
ext4_unaligned_io(struct inode *inode, struct iov_iter *from, loff_t pos)
{
	struct super_block *sb = inode->i_sb;
	unsigned long blockmask = sb->s_blocksize - 1;

	if ((pos | iov_iter_alignment(from)) & blockmask)
		return true;

	return false;
}

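/* Does this I/O extend the file past i_size or the on-disk size? */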
static bool
ext4_extending_io(struct inode *inode, loff_t offset, size_t len)
{
	if (offset + len > i_size_read(inode) ||
	    offset + len > EXT4_I(inode)->i_disksize)
		return true;
	return false;
}

/* Is IO overwriting allocated and initialized blocks? */
static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
{
	struct ext4_map_blocks map;
	unsigned int blkbits = inode->i_blkbits;
	int err, blklen;

	if (pos + len > i_size_read(inode))
		return false;

	map.m_lblk = pos >> blkbits;
	map.m_len = EXT4_MAX_BLOCKS(len, pos, blkbits);
	blklen = map.m_len;

	err = ext4_map_blocks(NULL, inode, &map, 0);
	/*
	 * 'err == blklen' means that all of the blocks have been preallocated,
	 * regardless of whether they have been initialized or not. To exclude
	 * unwritten extents, we need to check m_flags.
	 */
	return err == blklen && (map.m_flags & EXT4_MAP_MAPPED);
}

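/*
 * Write checks common to all write paths: refuse writes to immutable inodes,
 * run the generic VFS checks, and cap the write at s_bitmap_maxbytes for
 * non-extent files. Returns the number of bytes that may be written, 0 if
 * there is nothing to write, or a negative error.
 */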
static ssize_t ext4_generic_write_checks(struct kiocb *iocb,
					 struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	if (unlikely(IS_IMMUTABLE(inode)))
		return -EPERM;

	ret = generic_write_checks(iocb, from);
	if (ret <= 0)
		return ret;

	/*
	 * If we have encountered a bitmap-format file, the size limit
	 * is smaller than s_maxbytes, which is for extent-mapped files.
	 */
	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

		if (iocb->ki_pos >= sbi->s_bitmap_maxbytes)
			return -EFBIG;
		iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos);
	}

	return iov_iter_count(from);
}

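/*
 * Full write checks used by the buffered and DAX write paths: the generic
 * checks above plus file_modified(), which updates timestamps and strips
 * SUID/SGID bits.
 */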
static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
{
	ssize_t ret, count;

	count = ext4_generic_write_checks(iocb, from);
	if (count <= 0)
		return count;

	ret = file_modified(iocb->ki_filp);
	if (ret)
		return ret;
	return count;
}

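/*
 * Buffered write path: performs the write through the page cache under the
 * exclusive i_rwsem. IOCB_NOWAIT is not supported here.
 */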
static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
					struct iov_iter *from)
{
	ssize_t ret;
	struct inode *inode = file_inode(iocb->ki_filp);

	if (iocb->ki_flags & IOCB_NOWAIT)
		return -EOPNOTSUPP;

	ext4_fc_start_update(inode);
	inode_lock(inode);
	ret = ext4_write_checks(iocb, from);
	if (ret <= 0)
		goto out;

	current->backing_dev_info = inode_to_bdi(inode);
	ret = generic_perform_write(iocb->ki_filp, from, iocb->ki_pos);
	current->backing_dev_info = NULL;

out:
	inode_unlock(inode);
	ext4_fc_stop_update(inode);
	if (likely(ret > 0)) {
		iocb->ki_pos += ret;
		ret = generic_write_sync(iocb, ret);
	}

	return ret;
}

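/*
 * Finish an extending DIO/DAX write: update i_size/i_disksize to reflect what
 * was actually written, drop the inode from the orphan list and truncate away
 * any blocks that were allocated beyond the new EOF.
 */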
static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
					   ssize_t written, size_t count)
{
	handle_t *handle;
	bool truncate = false;
	u8 blkbits = inode->i_blkbits;
	ext4_lblk_t written_blk, end_blk;
	int ret;

	/*
	 * Note that EXT4_I(inode)->i_disksize can get extended up to
	 * inode->i_size while the I/O was running due to writeback of delalloc
	 * blocks. But, the code in ext4_iomap_alloc() is careful to use
	 * zeroed/unwritten extents if this is possible; thus we won't leave
	 * uninitialized blocks in a file even if we didn't succeed in writing
	 * as much as we intended.
	 */
	WARN_ON_ONCE(i_size_read(inode) < EXT4_I(inode)->i_disksize);
	if (offset + count <= EXT4_I(inode)->i_disksize) {
		/*
		 * We need to ensure that the inode is removed from the orphan
		 * list if it has been added prematurely, due to writeback of
		 * delalloc blocks.
		 */
		if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) {
			handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);

			if (IS_ERR(handle)) {
				ext4_orphan_del(NULL, inode);
				return PTR_ERR(handle);
			}

			ext4_orphan_del(handle, inode);
			ext4_journal_stop(handle);
		}

		return written;
	}

	if (written < 0)
		goto truncate;

	handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
	if (IS_ERR(handle)) {
		written = PTR_ERR(handle);
		goto truncate;
	}

	if (ext4_update_inode_size(inode, offset + written)) {
		ret = ext4_mark_inode_dirty(handle, inode);
		if (unlikely(ret)) {
			written = ret;
			ext4_journal_stop(handle);
			goto truncate;
		}
	}

	/*
	 * We may need to truncate allocated but not written blocks beyond EOF.
	 */
	written_blk = ALIGN(offset + written, 1 << blkbits);
	end_blk = ALIGN(offset + count, 1 << blkbits);
	if (written_blk < end_blk && ext4_can_truncate(inode))
		truncate = true;

	/*
	 * Remove the inode from the orphan list if it has been extended and
	 * everything went OK.
	 */
	if (!truncate && inode->i_nlink)
		ext4_orphan_del(handle, inode);
	ext4_journal_stop(handle);

	if (truncate) {
truncate:
		ext4_truncate_failed_write(inode);
		/*
		 * If the truncate operation failed early, then the inode may
		 * still be on the orphan list. In that case, we need to try to
		 * remove the inode from the in-memory linked list.
		 */
		if (inode->i_nlink)
			ext4_orphan_del(NULL, inode);
	}

	return written;
}

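/*
 * iomap ->end_io handler for direct writes: converts unwritten extents that
 * were written to and, for extending writes, bumps the in-memory i_size
 * before the page cache is invalidated.
 */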
static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
				 int error, unsigned int flags)
{
	loff_t pos = iocb->ki_pos;
	struct inode *inode = file_inode(iocb->ki_filp);

	if (error)
		return error;

	if (size && flags & IOMAP_DIO_UNWRITTEN) {
		error = ext4_convert_unwritten_extents(NULL, inode, pos, size);
		if (error < 0)
			return error;
	}
	/*
	 * If we are extending the file, we have to update i_size here before
	 * page cache gets invalidated in iomap_dio_rw(). Otherwise racing
	 * buffered reads could zero out too much from page cache pages. Update
	 * of on-disk size will happen later in ext4_dio_write_iter() where
	 * we have enough information to also perform orphan list handling etc.
	 * Note that we perform all extending writes synchronously under
	 * i_rwsem held exclusively so i_size update is safe here in that case.
	 * If the write was not extending, we cannot see pos > i_size here
	 * because operations reducing i_size like truncate wait for all
	 * outstanding DIO before updating i_size.
	 */
	pos += size;
	if (pos > i_size_read(inode))
		i_size_write(inode, pos);

	return 0;
}

static const struct iomap_dio_ops ext4_dio_write_ops = {
	.end_io = ext4_dio_write_end_io,
};

/*
 * The intention here is to start with the shared lock acquired and then see
 * if any condition requires an exclusive inode lock. If yes, then we restart
 * the whole operation by releasing the shared lock and acquiring the
 * exclusive lock.
 *
 * - For unaligned IO we never take the shared lock, as it may cause data
 *   corruption when two unaligned IOs try to modify the same block, e.g.
 *   while zeroing.
 *
 * - For extending writes we don't take the shared lock, since they require
 *   updating inode i_disksize and/or orphan handling under the exclusive
 *   lock.
 *
 * - Shared locking is therefore mostly limited to overwrites. Otherwise we
 *   switch to the exclusive i_rwsem lock.
 */
static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
				     bool *ilock_shared, bool *extend)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	loff_t offset;
	size_t count;
	ssize_t ret;

restart:
	ret = ext4_generic_write_checks(iocb, from);
	if (ret <= 0)
		goto out;

	offset = iocb->ki_pos;
	count = ret;
	if (ext4_extending_io(inode, offset, count))
		*extend = true;
	/*
	 * Determine whether the IO operation will overwrite allocated
	 * and initialized blocks.
	 * We need exclusive i_rwsem for changing security info
	 * in file_modified().
	 */
	if (*ilock_shared && (!IS_NOSEC(inode) || *extend ||
	     !ext4_overwrite_io(inode, offset, count))) {
		if (iocb->ki_flags & IOCB_NOWAIT) {
			ret = -EAGAIN;
			goto out;
		}
		inode_unlock_shared(inode);
		*ilock_shared = false;
		inode_lock(inode);
		goto restart;
	}

	ret = file_modified(file);
	if (ret < 0)
		goto out;

	return count;
out:
	if (*ilock_shared)
		inode_unlock_shared(inode);
	else
		inode_unlock(inode);
	return ret;
}

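/*
 * Direct I/O write path: chooses shared vs. exclusive i_rwsem, reserves an
 * orphan entry for extending writes, issues the write through iomap_dio_rw()
 * and falls back to buffered I/O for any part that could not be written
 * directly.
 */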
static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	ssize_t ret;
	handle_t *handle;
	struct inode *inode = file_inode(iocb->ki_filp);
	loff_t offset = iocb->ki_pos;
	size_t count = iov_iter_count(from);
	const struct iomap_ops *iomap_ops = &ext4_iomap_ops;
	bool extend = false, unaligned_io = false;
	bool ilock_shared = true;

	/*
	 * We initially start with the shared inode lock unless it is
	 * unaligned IO, which needs the exclusive lock anyway.
	 */
	if (ext4_unaligned_io(inode, from, offset)) {
		unaligned_io = true;
		ilock_shared = false;
	}
	/*
	 * Quick check here without any i_rwsem lock to see if it is extending
	 * IO. A more reliable check is done in ext4_dio_write_checks() with
	 * proper locking in place.
	 */
	if (offset + count > i_size_read(inode))
		ilock_shared = false;

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (ilock_shared) {
			if (!inode_trylock_shared(inode))
				return -EAGAIN;
		} else {
			if (!inode_trylock(inode))
				return -EAGAIN;
		}
	} else {
		if (ilock_shared)
			inode_lock_shared(inode);
		else
			inode_lock(inode);
	}

	/* Fall back to buffered I/O if the inode does not support direct I/O. */
	if (!ext4_dio_supported(iocb, from)) {
		if (ilock_shared)
			inode_unlock_shared(inode);
		else
			inode_unlock(inode);
		return ext4_buffered_write_iter(iocb, from);
	}

	ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend);
	if (ret <= 0)
		return ret;

	/* if we're going to block and IOCB_NOWAIT is set, return -EAGAIN */
	if ((iocb->ki_flags & IOCB_NOWAIT) && (unaligned_io || extend)) {
		ret = -EAGAIN;
		goto out;
	}
	/*
	 * Make sure inline data cannot be created anymore since we are going
	 * to allocate blocks for DIO. We know the inode does not have any
	 * inline data now because ext4_dio_supported() checked for that.
	 */
	ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);

	offset = iocb->ki_pos;
	count = ret;

	/*
	 * Unaligned direct IO writes must be serialized with each other, as
	 * zeroing of partial blocks by two competing unaligned IOs can result
	 * in data corruption.
	 *
	 * So we make sure we don't allow any unaligned IO in flight.
	 * For IOs where we need not wait (like unaligned non-AIO DIO),
	 * the inode_dio_wait() below may be a no-op anyway, since we start
	 * with the exclusive lock.
	 */
	if (unaligned_io)
		inode_dio_wait(inode);

	if (extend) {
		handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
		if (IS_ERR(handle)) {
			ret = PTR_ERR(handle);
			goto out;
		}

		ext4_fc_start_update(inode);
		ret = ext4_orphan_add(handle, inode);
		ext4_fc_stop_update(inode);
		if (ret) {
			ext4_journal_stop(handle);
			goto out;
		}

		ext4_journal_stop(handle);
	}

	if (ilock_shared)
		iomap_ops = &ext4_iomap_overwrite_ops;
	ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
			   is_sync_kiocb(iocb) || unaligned_io || extend);
	if (ret == -ENOTBLK)
		ret = 0;

	if (extend)
		ret = ext4_handle_inode_extension(inode, offset, ret, count);

out:
	if (ilock_shared)
		inode_unlock_shared(inode);
	else
		inode_unlock(inode);

	if (ret >= 0 && iov_iter_count(from)) {
		ssize_t err;
		loff_t endbyte;

		offset = iocb->ki_pos;
		err = ext4_buffered_write_iter(iocb, from);
		if (err < 0)
			return err;

		/*
		 * We need to ensure that the pages within the page cache for
		 * the range covered by this I/O are written to disk and
		 * invalidated. This is an attempt to preserve the expected
		 * direct I/O semantics in the case where we fall back to
		 * buffered I/O to complete the I/O request.
		 */
		ret += err;
		endbyte = offset + err - 1;
		err = filemap_write_and_wait_range(iocb->ki_filp->f_mapping,
						   offset, endbyte);
		if (!err)
			invalidate_mapping_pages(iocb->ki_filp->f_mapping,
						 offset >> PAGE_SHIFT,
						 endbyte >> PAGE_SHIFT);
	}

	return ret;
}

#ifdef CONFIG_FS_DAX
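/*
 * DAX write path: writes directly to persistent memory under the exclusive
 * i_rwsem, adding the inode to the orphan list first when the write extends
 * the on-disk size.
 */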
static ssize_t
ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	ssize_t ret;
	size_t count;
	loff_t offset;
	handle_t *handle;
	bool extend = false;
	struct inode *inode = file_inode(iocb->ki_filp);

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!inode_trylock(inode))
			return -EAGAIN;
	} else {
		inode_lock(inode);
	}

	ret = ext4_write_checks(iocb, from);
	if (ret <= 0)
		goto out;

	offset = iocb->ki_pos;
	count = iov_iter_count(from);

	if (offset + count > EXT4_I(inode)->i_disksize) {
		handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
		if (IS_ERR(handle)) {
			ret = PTR_ERR(handle);
			goto out;
		}

		ret = ext4_orphan_add(handle, inode);
		if (ret) {
			ext4_journal_stop(handle);
			goto out;
		}

		extend = true;
		ext4_journal_stop(handle);
	}

	ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);

	if (extend)
		ret = ext4_handle_inode_extension(inode, offset, ret, count);
out:
	inode_unlock(inode);
	if (ret > 0)
		ret = generic_write_sync(iocb, ret);
	return ret;
}
#endif

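/*
 * Top-level ->write_iter() handler: dispatches to the DAX, direct I/O or
 * buffered write path.
 */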
static ssize_t
ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);

	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
		return -EIO;

#ifdef CONFIG_FS_DAX
	if (IS_DAX(inode))
		return ext4_dax_write_iter(iocb, from);
#endif
	if (iocb->ki_flags & IOCB_DIRECT)
		return ext4_dio_write_iter(iocb, from);
	else
		return ext4_buffered_write_iter(iocb, from);
}

#ifdef CONFIG_FS_DAX
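/*
 * DAX page fault handler shared by all page-entry sizes. Write faults into
 * shared mappings run inside a journal handle so blocks can be allocated,
 * and are retried on ENOSPC.
 */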
static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf,
		enum page_entry_size pe_size)
{
	int error = 0;
	vm_fault_t result;
	int retries = 0;
	handle_t *handle = NULL;
	struct inode *inode = file_inode(vmf->vma->vm_file);
	struct super_block *sb = inode->i_sb;

	/*
	 * We have to distinguish real writes from writes which will result in a
	 * COW page; COW writes should *not* poke the journal (the file will not
	 * be changed). Doing so would cause unintended failures when mounted
	 * read-only.
	 *
	 * We check for VM_SHARED rather than vmf->cow_page since the latter is
	 * unset for pe_size != PE_SIZE_PTE (i.e. only in do_cow_fault); for
	 * other sizes, dax_iomap_fault will handle splitting / fallback so that
	 * we eventually come back with a COW page.
	 */
	bool write = (vmf->flags & FAULT_FLAG_WRITE) &&
		(vmf->vma->vm_flags & VM_SHARED);
	pfn_t pfn;

	if (write) {
		sb_start_pagefault(sb);
		file_update_time(vmf->vma->vm_file);
		down_read(&EXT4_I(inode)->i_mmap_sem);
retry:
		handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
					       EXT4_DATA_TRANS_BLOCKS(sb));
		if (IS_ERR(handle)) {
			up_read(&EXT4_I(inode)->i_mmap_sem);
			sb_end_pagefault(sb);
			return VM_FAULT_SIGBUS;
		}
	} else {
		down_read(&EXT4_I(inode)->i_mmap_sem);
	}
	result = dax_iomap_fault(vmf, pe_size, &pfn, &error, &ext4_iomap_ops);
	if (write) {
		ext4_journal_stop(handle);

		if ((result & VM_FAULT_ERROR) && error == -ENOSPC &&
		    ext4_should_retry_alloc(sb, &retries))
			goto retry;
		/* Handling synchronous page fault? */
		if (result & VM_FAULT_NEEDDSYNC)
			result = dax_finish_sync_fault(vmf, pe_size, pfn);
		up_read(&EXT4_I(inode)->i_mmap_sem);
		sb_end_pagefault(sb);
	} else {
		up_read(&EXT4_I(inode)->i_mmap_sem);
	}

	return result;
}

static vm_fault_t ext4_dax_fault(struct vm_fault *vmf)
{
	return ext4_dax_huge_fault(vmf, PE_SIZE_PTE);
}

static const struct vm_operations_struct ext4_dax_vm_ops = {
	.fault		= ext4_dax_fault,
	.huge_fault	= ext4_dax_huge_fault,
	.page_mkwrite	= ext4_dax_fault,
	.pfn_mkwrite	= ext4_dax_fault,
};
#else
#define ext4_dax_vm_ops	ext4_file_vm_ops
#endif

static const struct vm_operations_struct ext4_file_vm_ops = {
	.fault		= ext4_filemap_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite   = ext4_page_mkwrite,
#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
	.allow_speculation = filemap_allow_speculation,
#endif
};

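/*
 * ->mmap() handler: rejects synchronous (MAP_SYNC) mappings that cannot be
 * supported and installs either the DAX or the regular page-cache
 * vm_operations.
 */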
static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct inode *inode = file->f_mapping->host;
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	struct dax_device *dax_dev = sbi->s_daxdev;

	if (unlikely(ext4_forced_shutdown(sbi)))
		return -EIO;

	/*
	 * We don't support synchronous mappings for non-DAX files and
	 * for DAX files if the underlying dax_device is not synchronous.
	 */
	if (!daxdev_mapping_supported(vma, dax_dev))
		return -EOPNOTSUPP;

	file_accessed(file);
	if (IS_DAX(file_inode(file))) {
		vma->vm_ops = &ext4_dax_vm_ops;
		vma->vm_flags |= VM_HUGEPAGE;
	} else {
		vma->vm_ops = &ext4_file_vm_ops;
	}
	return 0;
}

static int ext4_sample_last_mounted(struct super_block *sb,
				    struct vfsmount *mnt)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct path path;
	char buf[64], *cp;
	handle_t *handle;
	int err;

	if (likely(ext4_test_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED)))
		return 0;

	if (sb_rdonly(sb) || !sb_start_intwrite_trylock(sb))
		return 0;

	ext4_set_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED);
	/*
	 * Sample where the filesystem has been mounted and
	 * store it in the superblock for sysadmin convenience
	 * when trying to sort through large numbers of block
	 * devices or filesystem images.
	 */
	memset(buf, 0, sizeof(buf));
	path.mnt = mnt;
	path.dentry = mnt->mnt_root;
	cp = d_path(&path, buf, sizeof(buf));
	err = 0;
	if (IS_ERR(cp))
		goto out;

	handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
	err = PTR_ERR(handle);
	if (IS_ERR(handle))
		goto out;
	BUFFER_TRACE(sbi->s_sbh, "get_write_access");
	err = ext4_journal_get_write_access(handle, sbi->s_sbh);
	if (err)
		goto out_journal;
	strncpy(sbi->s_es->s_last_mounted, cp,
		sizeof(sbi->s_es->s_last_mounted));
	ext4_handle_dirty_super(handle, sb);
out_journal:
	ext4_journal_stop(handle);
out:
	sb_end_intwrite(sb);
	return err;
}

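/*
 * ->open() handler: records the mount point in the superblock, runs the
 * fscrypt/fsverity open checks, attaches the jbd2 inode for writers and
 * enables FMODE_NOWAIT and async buffered reads.
 */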
static int ext4_file_open(struct inode *inode, struct file *filp)
{
	int ret;

	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
		return -EIO;

	ret = ext4_sample_last_mounted(inode->i_sb, filp->f_path.mnt);
	if (ret)
		return ret;

	ret = fscrypt_file_open(inode, filp);
	if (ret)
		return ret;

	ret = fsverity_file_open(inode, filp);
	if (ret)
		return ret;

	/*
	 * Set up the jbd2_inode if we are opening the inode for
	 * writing and the journal is present
	 */
	if (filp->f_mode & FMODE_WRITE) {
		ret = ext4_inode_attach_jinode(inode);
		if (ret < 0)
			return ret;
	}

	filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
	return dquot_file_open(inode, filp);
}

/*
 * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values
 * by calling generic_file_llseek_size() with the appropriate maxbytes
 * value for each.
 */
loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file->f_mapping->host;
	loff_t maxbytes;

	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
		maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
	else
		maxbytes = inode->i_sb->s_maxbytes;

	switch (whence) {
	default:
		return generic_file_llseek_size(file, offset, whence,
						maxbytes, i_size_read(inode));
	case SEEK_HOLE:
		inode_lock_shared(inode);
		offset = iomap_seek_hole(inode, offset,
					 &ext4_iomap_report_ops);
		inode_unlock_shared(inode);
		break;
	case SEEK_DATA:
		inode_lock_shared(inode);
		offset = iomap_seek_data(inode, offset,
					 &ext4_iomap_report_ops);
		inode_unlock_shared(inode);
		break;
	}

	if (offset < 0)
		return offset;
	return vfs_setpos(file, offset, maxbytes);
}

const struct file_operations ext4_file_operations = {
	.llseek		= ext4_llseek,
	.read_iter	= ext4_file_read_iter,
	.write_iter	= ext4_file_write_iter,
	.iopoll		= iomap_dio_iopoll,
	.unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= ext4_compat_ioctl,
#endif
	.mmap		= ext4_file_mmap,
	.mmap_supported_flags = MAP_SYNC,
	.open		= ext4_file_open,
	.release	= ext4_release_file,
	.fsync		= ext4_sync_file,
	.get_unmapped_area = thp_get_unmapped_area,
	.splice_read	= generic_file_splice_read,
	.splice_write	= iter_file_splice_write,
	.fallocate	= ext4_fallocate,
};

const struct inode_operations ext4_file_inode_operations = {
	.setattr	= ext4_setattr,
	.getattr	= ext4_file_getattr,
	.listxattr	= ext4_listxattr,
	.get_acl	= ext4_get_acl,
	.set_acl	= ext4_set_acl,
	.fiemap		= ext4_fiemap,
};