// SPDX-License-Identifier: GPL-2.0+
/*
 * linux/fs/jbd2/commit.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * Journal commit routines for the generic filesystem journaling code;
 * part of the ext2fs journaling system.
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/crc32.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/bitops.h>
#include <trace/events/jbd2.h>

/*
 * IO end handler for temporary buffer_heads handling writes to the journal.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	struct buffer_head *orig_bh = bh->b_private;

	BUFFER_TRACE(bh, "");
	if (uptodate)
		set_buffer_uptodate(bh);
	else
		clear_buffer_uptodate(bh);
	if (orig_bh) {
		clear_bit_unlock(BH_Shadow, &orig_bh->b_state);
		smp_mb__after_atomic();
		wake_up_bit(&orig_bh->b_state, BH_Shadow);
	}
	unlock_buffer(bh);
}
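
/*
 * Note on the BH_Shadow handshake above: while a metadata buffer is being
 * copied out to the log, writers needing write access to the *original*
 * buffer_head sleep on its shadow bit. The clear_bit_unlock() +
 * smp_mb__after_atomic() + wake_up_bit() sequence is the standard kernel
 * unlock-and-wake pattern, pairing with a waiter in transaction.c's
 * do_get_write_access() along the lines of:
 *
 *	wait_on_bit_io(&bh->b_state, BH_Shadow, TASK_UNINTERRUPTIBLE);
 *
 * The barrier ensures the waiter observes the cleared bit before it is
 * woken.
 */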

/*
 * When an ext4 file is truncated, it is possible that some pages are not
 * successfully freed, because they are attached to a committing transaction.
 * After the transaction commits, these pages are left on the LRU, with no
 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
 * by the VM, but their apparent absence upsets the VM accounting, and it makes
 * the numbers in /proc/meminfo look odd.
 *
 * So here, we have a buffer which has just come off the forget list.  Look to
 * see if we can strip all buffers from the backing page.
 *
 * Called under lock_journal(), and possibly under journal_datalist_lock.  The
 * caller provided us with a ref against the buffer, and we drop that here.
 */
static void release_buffer_page(struct buffer_head *bh)
{
	struct page *page;

	if (buffer_dirty(bh))
		goto nope;
	if (atomic_read(&bh->b_count) != 1)
		goto nope;
	page = bh->b_page;
	if (!page)
		goto nope;
	if (page->mapping)
		goto nope;

	/* OK, it's a truncated page */
	if (!trylock_page(page))
		goto nope;

	get_page(page);
	__brelse(bh);
	try_to_free_buffers(page);
	unlock_page(page);
	put_page(page);
	return;

nope:
	__brelse(bh);
}

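/*
 * Compute and install the checksum of a commit block for journals with the
 * v2/v3 checksum feature. The checksum covers the whole journal block, so
 * the checksum field itself (and the legacy type/size fields) is zeroed
 * first and the result is written back afterwards.
 */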
static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh)
{
	struct commit_header *h;
	__u32 csum;

	if (!jbd2_journal_has_csum_v2or3(j))
		return;

	h = (struct commit_header *)(bh->b_data);
	h->h_chksum_type = 0;
	h->h_chksum_size = 0;
	h->h_chksum[0] = 0;
	csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
	h->h_chksum[0] = cpu_to_be32(csum);
}

/*
 * Done it all: now submit the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * Returns 1 if the journal needs to be aborted or 0 on success
 */
static int journal_submit_commit_record(journal_t *journal,
					transaction_t *commit_transaction,
					struct buffer_head **cbh,
					__u32 crc32_sum)
{
	struct commit_header *tmp;
	struct buffer_head *bh;
	int ret;
	struct timespec64 now;

	*cbh = NULL;

	if (is_journal_aborted(journal))
		return 0;

	bh = jbd2_journal_get_descriptor_buffer(commit_transaction,
						JBD2_COMMIT_BLOCK);
	if (!bh)
		return 1;

	tmp = (struct commit_header *)bh->b_data;
	ktime_get_coarse_real_ts64(&now);
	tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
	tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);

	if (jbd2_has_feature_checksum(journal)) {
		tmp->h_chksum_type	= JBD2_CRC32_CHKSUM;
		tmp->h_chksum_size	= JBD2_CRC32_CHKSUM_SIZE;
		tmp->h_chksum[0]	= cpu_to_be32(crc32_sum);
	}
	jbd2_commit_block_csum_set(journal, bh);

	BUFFER_TRACE(bh, "submit commit block");
	lock_buffer(bh);
	clear_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = journal_end_buffer_io_sync;

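	/*
	 * For a synchronous (non-async-commit) commit on a barrier-enabled
	 * journal, the commit block is issued with REQ_PREFLUSH so that all
	 * previously submitted journal blocks reach stable storage before
	 * the commit record, and with REQ_FUA so the record itself is
	 * durable on completion. With async commit, the record's validity
	 * is established by its checksum instead, so a plain write suffices
	 * here and a cache flush is issued later, once all journal IO has
	 * completed.
	 */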
	if (journal->j_flags & JBD2_BARRIER &&
	    !jbd2_has_feature_async_commit(journal))
		ret = submit_bh(REQ_OP_WRITE,
			REQ_SYNC | REQ_PREFLUSH | REQ_FUA, bh);
	else
		ret = submit_bh(REQ_OP_WRITE, REQ_SYNC, bh);

	*cbh = bh;
	return ret;
}

/*
 * This function, along with journal_submit_commit_record(), allows the
 * commit record to be written asynchronously.
 */
static int journal_wait_on_commit_record(journal_t *journal,
					 struct buffer_head *bh)
{
	int ret = 0;

	clear_buffer_dirty(bh);
	wait_on_buffer(bh);

	if (unlikely(!buffer_uptodate(bh)))
		ret = -EIO;
	put_bh(bh);            /* One for getblk() */

	return ret;
}

/*
 * Write the filemap data using the writepage() address_space_operation.
 * We don't do block allocation here even for delalloc. We don't
 * use writepages() because with delayed allocation we may be doing
 * block allocation in writepages().
 */
int jbd2_journal_submit_inode_data_buffers(struct jbd2_inode *jinode)
{
	struct address_space *mapping = jinode->i_vfs_inode->i_mapping;
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_ALL,
		.nr_to_write = mapping->nrpages * 2,
		.range_start = jinode->i_dirty_start,
		.range_end = jinode->i_dirty_end,
	};

	/*
	 * Submit the inode data buffers. We use writepage
	 * instead of writepages because writepages can do
	 * block allocation with delalloc. We need to write
	 * only allocated blocks here.
	 */
	return generic_writepages(mapping, &wbc);
}

/* Send all the data buffers related to an inode */
int jbd2_submit_inode_data(struct jbd2_inode *jinode)
{
	if (!jinode || !(jinode->i_flags & JI_WRITE_DATA))
		return 0;

	trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
	return jbd2_journal_submit_inode_data_buffers(jinode);
}
EXPORT_SYMBOL(jbd2_submit_inode_data);

int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode)
{
	if (!jinode || !(jinode->i_flags & JI_WAIT_DATA) ||
		!jinode->i_vfs_inode || !jinode->i_vfs_inode->i_mapping)
		return 0;
	return filemap_fdatawait_range_keep_errors(
		jinode->i_vfs_inode->i_mapping, jinode->i_dirty_start,
		jinode->i_dirty_end);
}
EXPORT_SYMBOL(jbd2_wait_inode_data);

/*
 * Submit all the data buffers of inodes associated with the transaction to
 * disk.
 *
 * We are in a committing transaction. Therefore no new inode can be added to
 * our inode list. We use the JI_COMMIT_RUNNING flag to protect the inode we
 * currently operate on from being released while we write out pages.
 */
static int journal_submit_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode;
	int err, ret = 0;

	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		if (!(jinode->i_flags & JI_WRITE_DATA))
			continue;
		jinode->i_flags |= JI_COMMIT_RUNNING;
		spin_unlock(&journal->j_list_lock);
		/* submit the inode data buffers. */
		trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
		if (journal->j_submit_inode_data_buffers) {
			err = journal->j_submit_inode_data_buffers(jinode);
			if (!ret)
				ret = err;
		}
		spin_lock(&journal->j_list_lock);
		J_ASSERT(jinode->i_transaction == commit_transaction);
		jinode->i_flags &= ~JI_COMMIT_RUNNING;
		smp_mb();
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}
	spin_unlock(&journal->j_list_lock);
	return ret;
}

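/*
 * Used as the default ->j_finish_inode_data_buffers callback: wait for
 * writeback of the inode's dirty range to complete, preserving any IO
 * error on the mapping (the _keep_errors variant does not clear
 * AS_EIO/AS_ENOSPC, so the error stays visible to a later fsync() as well).
 */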
int jbd2_journal_finish_inode_data_buffers(struct jbd2_inode *jinode)
{
	struct address_space *mapping = jinode->i_vfs_inode->i_mapping;

	return filemap_fdatawait_range_keep_errors(mapping,
						   jinode->i_dirty_start,
						   jinode->i_dirty_end);
}

/*
 * Wait for data submitted for writeout, refile inodes to the proper
 * transaction if needed.
 */
static int journal_finish_inode_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode, *next_i;
	int err, ret = 0;

	/* For locking, see the comment in journal_submit_data_buffers() */
	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		if (!(jinode->i_flags & JI_WAIT_DATA))
			continue;
		jinode->i_flags |= JI_COMMIT_RUNNING;
		spin_unlock(&journal->j_list_lock);
		/* wait for the inode data buffers writeout. */
		if (journal->j_finish_inode_data_buffers) {
			err = journal->j_finish_inode_data_buffers(jinode);
			if (!ret)
				ret = err;
		}
		spin_lock(&journal->j_list_lock);
		jinode->i_flags &= ~JI_COMMIT_RUNNING;
		smp_mb();
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}

	/* Now refile inodes to the proper lists */
	list_for_each_entry_safe(jinode, next_i,
				 &commit_transaction->t_inode_list, i_list) {
		list_del(&jinode->i_list);
		if (jinode->i_next_transaction) {
			jinode->i_transaction = jinode->i_next_transaction;
			jinode->i_next_transaction = NULL;
			list_add(&jinode->i_list,
				&jinode->i_transaction->t_inode_list);
		} else {
			jinode->i_transaction = NULL;
			jinode->i_dirty_start = 0;
			jinode->i_dirty_end = 0;
		}
	}
	spin_unlock(&journal->j_list_lock);

	return ret;
}

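/*
 * Fold a buffer's contents into the running big-endian CRC32 used for the
 * legacy v1 (JBD2_FEATURE_COMPAT_CHECKSUM) commit-block checksum. The data
 * is reached through a temporary atomic kmap of the buffer's page.
 */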
static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
{
	struct page *page = bh->b_page;
	char *addr;
	__u32 checksum;

	addr = kmap_atomic(page);
	checksum = crc32_be(crc32_sum,
		(void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
	kunmap_atomic(addr);

	return checksum;
}

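/*
 * Record a block number in a descriptor-block tag. The low 32 bits always
 * go in t_blocknr; with the 64bit feature the high bits are stored in
 * t_blocknr_high as well. The shift is written as (block >> 31) >> 1
 * rather than block >> 32, presumably so the expression stays well-defined
 * even if the type of block were ever only 32 bits wide.
 */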
static void write_tag_block(journal_t *j, journal_block_tag_t *tag,
				   unsigned long long block)
{
	tag->t_blocknr = cpu_to_be32(block & (u32)~0);
	if (jbd2_has_feature_64bit(j))
		tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
}

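/*
 * Compute the per-block tag checksum for v2/v3 journal checksums: the
 * checksum seed is chained over the big-endian transaction sequence number
 * and then over the block contents. v3 tags store the full 32-bit value;
 * v2 tags only have room for the low 16 bits.
 */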
static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
				    struct buffer_head *bh, __u32 sequence)
{
	journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag;
	struct page *page = bh->b_page;
	__u8 *addr;
	__u32 csum32;
	__be32 seq;

	if (!jbd2_journal_has_csum_v2or3(j))
		return;

	seq = cpu_to_be32(sequence);
	addr = kmap_atomic(page);
	csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
	csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data),
			     bh->b_size);
	kunmap_atomic(addr);

	if (jbd2_has_feature_csum3(j))
		tag3->t_checksum = cpu_to_be32(csum32);
	else
		tag->t_checksum = cpu_to_be16(csum32);
}

/*
 * jbd2_journal_commit_transaction
 *
 * The primary function for committing a transaction to the log.  This
 * function is called by the journal thread to begin a complete commit.
 */
void jbd2_journal_commit_transaction(journal_t *journal)
{
	struct transaction_stats_s stats;
	transaction_t *commit_transaction;
	struct journal_head *jh;
	struct buffer_head *descriptor;
	struct buffer_head **wbuf = journal->j_wbuf;
	int bufs;
	int flags;
	int err;
	unsigned long long blocknr;
	ktime_t start_time;
	u64 commit_time;
	char *tagp = NULL;
	journal_block_tag_t *tag = NULL;
	int space_left = 0;
	int first_tag = 0;
	int tag_flag;
	int i;
	int tag_bytes = journal_tag_bytes(journal);
	struct buffer_head *cbh = NULL; /* For transactional checksums */
	__u32 crc32_sum = ~0;
	struct blk_plug plug;
	/* Tail of the journal */
	unsigned long first_block;
	tid_t first_tid;
	int update_tail;
	int csum_size = 0;
	LIST_HEAD(io_bufs);
	LIST_HEAD(log_bufs);

	if (jbd2_journal_has_csum_v2or3(journal))
		csum_size = sizeof(struct jbd2_journal_block_tail);

	/*
	 * First job: lock down the current transaction and wait for
	 * all outstanding updates to complete.
	 */

	/* Do we need to erase the effects of a prior jbd2_journal_flush? */
	if (journal->j_flags & JBD2_FLUSHED) {
		jbd_debug(3, "super block updated\n");
		mutex_lock_io(&journal->j_checkpoint_mutex);
		/*
		 * We hold j_checkpoint_mutex so the tail cannot change under
		 * us. We don't need any special data guarantees for writing
		 * the sb since the journal is empty and it is ok for the
		 * write to be flushed only with the transaction commit.
		 */
		jbd2_journal_update_sb_log_tail(journal,
						journal->j_tail_sequence,
						journal->j_tail,
						REQ_SYNC);
		mutex_unlock(&journal->j_checkpoint_mutex);
	} else {
		jbd_debug(3, "superblock not updated\n");
	}

	J_ASSERT(journal->j_running_transaction != NULL);
	J_ASSERT(journal->j_committing_transaction == NULL);

	write_lock(&journal->j_state_lock);
	journal->j_flags |= JBD2_FULL_COMMIT_ONGOING;
	while (journal->j_flags & JBD2_FAST_COMMIT_ONGOING) {
		DEFINE_WAIT(wait);

		prepare_to_wait(&journal->j_fc_wait, &wait,
				TASK_UNINTERRUPTIBLE);
		write_unlock(&journal->j_state_lock);
		schedule();
		write_lock(&journal->j_state_lock);
		finish_wait(&journal->j_fc_wait, &wait);
		/*
		 * TODO: by blocking fast commits here, we are increasing
		 * fsync() latency slightly. Strictly speaking, we don't need
		 * to block fast commits until the transaction enters T_FLUSH
		 * state. So an optimization is possible where we block new fast
		 * commits here and wait for existing ones to complete
		 * just before we enter T_FLUSH. That way, the existing fast
		 * commits and this full commit can proceed in parallel.
		 */
	}
	write_unlock(&journal->j_state_lock);

	commit_transaction = journal->j_running_transaction;

	trace_jbd2_start_commit(journal, commit_transaction);
	jbd_debug(1, "JBD2: starting commit of transaction %d\n",
			commit_transaction->t_tid);

	write_lock(&journal->j_state_lock);
	journal->j_fc_off = 0;
	J_ASSERT(commit_transaction->t_state == T_RUNNING);
	commit_transaction->t_state = T_LOCKED;

	trace_jbd2_commit_locking(journal, commit_transaction);
	stats.run.rs_wait = commit_transaction->t_max_wait;
	stats.run.rs_request_delay = 0;
	stats.run.rs_locked = jiffies;
	if (commit_transaction->t_requested)
		stats.run.rs_request_delay =
			jbd2_time_diff(commit_transaction->t_requested,
				       stats.run.rs_locked);
	stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
					      stats.run.rs_locked);

	spin_lock(&commit_transaction->t_handle_lock);
	while (atomic_read(&commit_transaction->t_updates)) {
		DEFINE_WAIT(wait);

		prepare_to_wait(&journal->j_wait_updates, &wait,
					TASK_UNINTERRUPTIBLE);
		if (atomic_read(&commit_transaction->t_updates)) {
			spin_unlock(&commit_transaction->t_handle_lock);
			write_unlock(&journal->j_state_lock);
			schedule();
			write_lock(&journal->j_state_lock);
			spin_lock(&commit_transaction->t_handle_lock);
		}
		finish_wait(&journal->j_wait_updates, &wait);
	}
	spin_unlock(&commit_transaction->t_handle_lock);
	commit_transaction->t_state = T_SWITCH;
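	/*
	 * All outstanding handles have now completed (t_updates == 0), and
	 * since the transaction left T_RUNNING no new handle can attach to
	 * it. T_SWITCH marks the window in which the journal is switched
	 * over to a new running transaction before flushing begins.
	 */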

	J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
			journal->j_max_transaction_buffers);

	/*
	 * First thing we are allowed to do is to discard any remaining
	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
	 * that there are no such buffers: if a large filesystem
	 * operation like a truncate needs to split itself over multiple
	 * transactions, then it may try to do a jbd2_journal_restart() while
	 * there are still BJ_Reserved buffers outstanding.  These must
	 * be released cleanly from the current transaction.
	 *
	 * In this case, the filesystem must still reserve write access
	 * again before modifying the buffer in the new transaction, but
	 * we do not require it to remember exactly which old buffers it
	 * has reserved.  This is consistent with the existing behaviour
	 * that multiple jbd2_journal_get_write_access() calls to the same
	 * buffer are perfectly permissible.
	 * We use journal->j_state_lock here to serialize processing of
	 * t_reserved_list with eviction of buffers from journal_unmap_buffer().
	 */
	while (commit_transaction->t_reserved_list) {
		jh = commit_transaction->t_reserved_list;
		JBUFFER_TRACE(jh, "reserved, unused: refile");
		/*
		 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
		 * leave undo-committed data.
		 */
		if (jh->b_committed_data) {
			struct buffer_head *bh = jh2bh(jh);

			spin_lock(&jh->b_state_lock);
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			spin_unlock(&jh->b_state_lock);
		}
		jbd2_journal_refile_buffer(journal, jh);
	}

	write_unlock(&journal->j_state_lock);
	/*
	 * Now try to drop any written-back buffers from the journal's
	 * checkpoint lists.  We do this *before* commit because it potentially
	 * frees some memory.
	 */
	spin_lock(&journal->j_list_lock);
	__jbd2_journal_clean_checkpoint_list(journal, false);
	spin_unlock(&journal->j_list_lock);

	jbd_debug(3, "JBD2: commit phase 1\n");

	/*
	 * Clear the revoked flag to reflect that there are no revoked buffers
	 * in the next transaction which is going to be started.
	 */
	jbd2_clear_buffer_revoked_flags(journal);

	/*
	 * Switch to a new revoke table.
	 */
	jbd2_journal_switch_revoke_table(journal);

	write_lock(&journal->j_state_lock);
	/*
	 * Reserved credits cannot be claimed anymore, free them
	 */
	atomic_sub(atomic_read(&journal->j_reserved_credits),
		   &commit_transaction->t_outstanding_credits);

	trace_jbd2_commit_flushing(journal, commit_transaction);
	stats.run.rs_flushing = jiffies;
	stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
					     stats.run.rs_flushing);

	commit_transaction->t_state = T_FLUSH;
	journal->j_committing_transaction = commit_transaction;
	journal->j_running_transaction = NULL;
	start_time = ktime_get();
	commit_transaction->t_log_start = journal->j_head;
	wake_up_all(&journal->j_wait_transaction_locked);
	write_unlock(&journal->j_state_lock);

	jbd_debug(3, "JBD2: commit phase 2a\n");

	/*
	 * Now start flushing things to disk, in the order they appear
	 * on the transaction lists.  Data blocks go first.
	 */
	err = journal_submit_data_buffers(journal, commit_transaction);
	if (err)
		jbd2_journal_abort(journal, err);

	blk_start_plug(&plug);
	jbd2_journal_write_revoke_records(commit_transaction, &log_bufs);

	jbd_debug(3, "JBD2: commit phase 2b\n");

	/*
	 * Way to go: we have now written out all of the data for a
	 * transaction!  Now comes the tricky part: we need to write out
	 * metadata.  Loop over the transaction's entire buffer list:
	 */
	write_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_COMMIT;
	write_unlock(&journal->j_state_lock);

	trace_jbd2_commit_logging(journal, commit_transaction);
	stats.run.rs_logging = jiffies;
	stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
					       stats.run.rs_logging);
	stats.run.rs_blocks = commit_transaction->t_nr_buffers;
	stats.run.rs_blocks_logged = 0;

	J_ASSERT(commit_transaction->t_nr_buffers <=
		 atomic_read(&commit_transaction->t_outstanding_credits));

	err = 0;
	bufs = 0;
	descriptor = NULL;
	while (commit_transaction->t_buffers) {

		/* Find the next buffer to be journaled... */

		jh = commit_transaction->t_buffers;

		/* If we're in abort mode, we just un-journal the buffer and
		   release it. */

		if (is_journal_aborted(journal)) {
			clear_buffer_jbddirty(jh2bh(jh));
			JBUFFER_TRACE(jh, "journal is aborting: refile");
			jbd2_buffer_abort_trigger(jh,
						  jh->b_frozen_data ?
						  jh->b_frozen_triggers :
						  jh->b_triggers);
			jbd2_journal_refile_buffer(journal, jh);
			/* If that was the last one, we need to clean up
			 * any descriptor buffers which may have been
			 * already allocated, even if we are now
			 * aborting. */
			if (!commit_transaction->t_buffers)
				goto start_journal_io;
			continue;
		}

		/* Make sure we have a descriptor block in which to
		   record the metadata buffer. */

		if (!descriptor) {
			J_ASSERT (bufs == 0);

			jbd_debug(4, "JBD2: get descriptor\n");

			descriptor = jbd2_journal_get_descriptor_buffer(
							commit_transaction,
							JBD2_DESCRIPTOR_BLOCK);
			if (!descriptor) {
				jbd2_journal_abort(journal, -EIO);
				continue;
			}

			jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
				(unsigned long long)descriptor->b_blocknr,
				descriptor->b_data);
			tagp = &descriptor->b_data[sizeof(journal_header_t)];
			space_left = descriptor->b_size -
						sizeof(journal_header_t);
			first_tag = 1;
			set_buffer_jwrite(descriptor);
			set_buffer_dirty(descriptor);
			wbuf[bufs++] = descriptor;

			/* Record it so that we can wait for IO
                           completion later */
			BUFFER_TRACE(descriptor, "ph3: file as descriptor");
			jbd2_file_log_bh(&log_bufs, descriptor);
		}

		/* Where is the buffer to be written? */

		err = jbd2_journal_next_log_block(journal, &blocknr);
		/* If the block mapping failed, just abandon the buffer
		   and repeat this loop: we'll fall into the
		   refile-on-abort condition above. */
		if (err) {
			jbd2_journal_abort(journal, err);
			continue;
		}

		/*
		 * start_this_handle() uses t_outstanding_credits to determine
		 * the free space in the log.
		 */
		atomic_dec(&commit_transaction->t_outstanding_credits);

		/* Bump b_count to prevent truncate from stumbling over
                   the shadowed buffer!  @@@ This can go if we ever get
                   rid of the shadow pairing of buffers. */
		atomic_inc(&jh2bh(jh)->b_count);

		/*
		 * Make a temporary IO buffer with which to write it out
		 * (this will requeue the metadata buffer to BJ_Shadow).
		 */
		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
		JBUFFER_TRACE(jh, "ph3: write metadata");
		flags = jbd2_journal_write_metadata_buffer(commit_transaction,
						jh, &wbuf[bufs], blocknr);
		if (flags < 0) {
			jbd2_journal_abort(journal, flags);
			continue;
		}
		jbd2_file_log_bh(&io_bufs, wbuf[bufs]);

		/* Record the new block's tag in the current descriptor
                   buffer */

		tag_flag = 0;
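		/*
		 * Bit 0 of the return value of
		 * jbd2_journal_write_metadata_buffer() reports whether the
		 * block had to be "escaped": if the first four bytes of the
		 * on-disk block would match the JBD2 magic number, they are
		 * zeroed in the copy written to the log and the tag is
		 * marked JBD2_FLAG_ESCAPE so that recovery can restore them.
		 */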
		if (flags & 1)
			tag_flag |= JBD2_FLAG_ESCAPE;
		if (!first_tag)
			tag_flag |= JBD2_FLAG_SAME_UUID;

		tag = (journal_block_tag_t *) tagp;
		write_tag_block(journal, tag, jh2bh(jh)->b_blocknr);
		tag->t_flags = cpu_to_be16(tag_flag);
		jbd2_block_tag_csum_set(journal, tag, wbuf[bufs],
					commit_transaction->t_tid);
		tagp += tag_bytes;
		space_left -= tag_bytes;
		bufs++;

		if (first_tag) {
			memcpy (tagp, journal->j_uuid, 16);
			tagp += 16;
			space_left -= 16;
			first_tag = 0;
		}

		/* If there's no more to do, or if the descriptor is full,
		   let the IO rip! */

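		/*
		 * The batch is flushed when the wbuf array is full, when the
		 * transaction has no more buffers, or when the descriptor
		 * block cannot hold another entry: a further tag needs
		 * tag_bytes, possibly a 16-byte UUID (reserved as a worst
		 * case even though only the first tag carries one), and
		 * csum_size bytes must stay free for the descriptor block
		 * tail.
		 */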
		if (bufs == journal->j_wbufsize ||
		    commit_transaction->t_buffers == NULL ||
		    space_left < tag_bytes + 16 + csum_size) {

			jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);

			/* Write an end-of-descriptor marker before
                           submitting the IOs.  "tag" still points to
                           the last tag we set up. */

			tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);
start_journal_io:
			if (descriptor)
				jbd2_descriptor_block_csum_set(journal,
							descriptor);

			for (i = 0; i < bufs; i++) {
				struct buffer_head *bh = wbuf[i];
				/*
				 * Compute checksum.
				 */
				if (jbd2_has_feature_checksum(journal)) {
					crc32_sum =
					    jbd2_checksum_data(crc32_sum, bh);
				}

				lock_buffer(bh);
				clear_buffer_dirty(bh);
				set_buffer_uptodate(bh);
				bh->b_end_io = journal_end_buffer_io_sync;
				submit_bh(REQ_OP_WRITE, REQ_SYNC, bh);
			}
			cond_resched();

			/* Force a new descriptor to be generated next
                           time round the loop. */
			descriptor = NULL;
			bufs = 0;
		}
	}

	err = journal_finish_inode_data_buffers(journal, commit_transaction);
	if (err) {
		printk(KERN_WARNING
			"JBD2: Detected IO errors while flushing file data "
		       "on %s\n", journal->j_devname);
		if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
			jbd2_journal_abort(journal, err);
		err = 0;
	}

	/*
	 * Get the current oldest transaction in the log before we issue a
	 * flush to the filesystem device. After the flush we can be sure that
	 * blocks of all older transactions are checkpointed to persistent
	 * storage and we will be safe to update the journal start in the
	 * superblock with the numbers we get here.
	 */
	update_tail =
		jbd2_journal_get_log_tail(journal, &first_tid, &first_block);

	write_lock(&journal->j_state_lock);
	if (update_tail) {
		long freed = first_block - journal->j_tail;

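		/*
		 * The log is circular: if the new tail block is numerically
		 * below the old one, the tail has wrapped past the end of
		 * the journal, so add the journal size to get the real
		 * amount of space freed.
		 */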
		if (first_block < journal->j_tail)
			freed += journal->j_last - journal->j_first;
		/* Update tail only if we free significant amount of space */
		if (freed < jbd2_journal_get_max_txn_bufs(journal))
			update_tail = 0;
	}
	J_ASSERT(commit_transaction->t_state == T_COMMIT);
	commit_transaction->t_state = T_COMMIT_DFLUSH;
	write_unlock(&journal->j_state_lock);

	/*
	 * If the journal is not located on the file system device,
	 * then we must flush the file system device before we issue
	 * the commit record
	 */
	if (commit_transaction->t_need_data_flush &&
	    (journal->j_fs_dev != journal->j_dev) &&
	    (journal->j_flags & JBD2_BARRIER))
		blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS);

	/* Done it all: now write the commit record asynchronously. */
	if (jbd2_has_feature_async_commit(journal)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						 &cbh, crc32_sum);
		if (err)
			jbd2_journal_abort(journal, err);
	}

	blk_finish_plug(&plug);

	/* Lo and behold: we have just managed to send a transaction to
           the log.  Before we can commit it, wait for the IO so far to
           complete.  Control buffers being written are on the
           transaction's t_log_list queue, and metadata buffers are on
           the io_bufs list.

	   Wait for the buffers in reverse order.  That way we are
	   less likely to be woken up until all IOs have completed, and
	   so we incur less scheduling load.
	*/

	jbd_debug(3, "JBD2: commit phase 3\n");

	while (!list_empty(&io_bufs)) {
		struct buffer_head *bh = list_entry(io_bufs.prev,
						    struct buffer_head,
						    b_assoc_buffers);

		wait_on_buffer(bh);
		cond_resched();

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;
		jbd2_unfile_log_bh(bh);
		stats.run.rs_blocks_logged++;

		/*
		 * The list contains temporary buffer heads created by
		 * jbd2_journal_write_metadata_buffer().
		 */
		BUFFER_TRACE(bh, "dumping temporary bh");
		__brelse(bh);
		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
		free_buffer_head(bh);

		/* We also have to refile the corresponding shadowed buffer */
		jh = commit_transaction->t_shadow_list->b_tprev;
		bh = jh2bh(jh);
		clear_buffer_jwrite(bh);
		J_ASSERT_BH(bh, buffer_jbddirty(bh));
		J_ASSERT_BH(bh, !buffer_shadow(bh));

		/* The metadata is now released for reuse, but we need
                   to remember it against this transaction so that when
                   we finally commit, we can do any checkpointing
                   required. */
		JBUFFER_TRACE(jh, "file as BJ_Forget");
		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
		JBUFFER_TRACE(jh, "brelse shadowed buffer");
		__brelse(bh);
	}

	J_ASSERT (commit_transaction->t_shadow_list == NULL);

	jbd_debug(3, "JBD2: commit phase 4\n");

	/* Here we wait for the revoke record and descriptor record buffers */
	while (!list_empty(&log_bufs)) {
		struct buffer_head *bh;

		bh = list_entry(log_bufs.prev, struct buffer_head, b_assoc_buffers);
		wait_on_buffer(bh);
		cond_resched();

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
		clear_buffer_jwrite(bh);
		jbd2_unfile_log_bh(bh);
		stats.run.rs_blocks_logged++;
		__brelse(bh);		/* One for getblk */
		/* AKPM: bforget here */
	}

	if (err)
		jbd2_journal_abort(journal, err);

	jbd_debug(3, "JBD2: commit phase 5\n");
	write_lock(&journal->j_state_lock);
	J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
	commit_transaction->t_state = T_COMMIT_JFLUSH;
	write_unlock(&journal->j_state_lock);

	if (!jbd2_has_feature_async_commit(journal)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						&cbh, crc32_sum);
		if (err)
			jbd2_journal_abort(journal, err);
	}
	if (cbh)
		err = journal_wait_on_commit_record(journal, cbh);
	stats.run.rs_blocks_logged++;
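	/*
	 * With async commit the commit block went out as a plain write (no
	 * PREFLUSH/FUA), so once all journal IO has completed we still need
	 * an explicit cache flush on the journal device before the
	 * transaction may be considered durable.
	 */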
935*4882a593Smuzhiyun 	if (jbd2_has_feature_async_commit(journal) &&
936*4882a593Smuzhiyun 	    journal->j_flags & JBD2_BARRIER) {
937*4882a593Smuzhiyun 		blkdev_issue_flush(journal->j_dev, GFP_NOFS);
938*4882a593Smuzhiyun 	}
939*4882a593Smuzhiyun 
940*4882a593Smuzhiyun 	if (err)
941*4882a593Smuzhiyun 		jbd2_journal_abort(journal, err);
942*4882a593Smuzhiyun 
943*4882a593Smuzhiyun 	WARN_ON_ONCE(
944*4882a593Smuzhiyun 		atomic_read(&commit_transaction->t_outstanding_credits) < 0);
945*4882a593Smuzhiyun 
946*4882a593Smuzhiyun 	/*
947*4882a593Smuzhiyun 	 * Now disk caches for filesystem device are flushed so we are safe to
948*4882a593Smuzhiyun 	 * erase checkpointed transactions from the log by updating journal
949*4882a593Smuzhiyun 	 * superblock.
950*4882a593Smuzhiyun 	 */
951*4882a593Smuzhiyun 	if (update_tail)
952*4882a593Smuzhiyun 		jbd2_update_log_tail(journal, first_tid, first_block);
953*4882a593Smuzhiyun 
954*4882a593Smuzhiyun 	/* End of a transaction!  Finally, we can do checkpoint
955*4882a593Smuzhiyun            processing: any buffers committed as a result of this
956*4882a593Smuzhiyun            transaction can be removed from any checkpoint list it was on
957*4882a593Smuzhiyun            before. */
958*4882a593Smuzhiyun 
959*4882a593Smuzhiyun 	jbd_debug(3, "JBD2: commit phase 6\n");
960*4882a593Smuzhiyun 
961*4882a593Smuzhiyun 	J_ASSERT(list_empty(&commit_transaction->t_inode_list));
962*4882a593Smuzhiyun 	J_ASSERT(commit_transaction->t_buffers == NULL);
963*4882a593Smuzhiyun 	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
964*4882a593Smuzhiyun 	J_ASSERT(commit_transaction->t_shadow_list == NULL);
965*4882a593Smuzhiyun 
966*4882a593Smuzhiyun restart_loop:
967*4882a593Smuzhiyun 	/*
968*4882a593Smuzhiyun 	 * As there are other places (journal_unmap_buffer()) adding buffers
969*4882a593Smuzhiyun 	 * to this list we have to be careful and hold the j_list_lock.
970*4882a593Smuzhiyun 	 */
971*4882a593Smuzhiyun 	spin_lock(&journal->j_list_lock);
972*4882a593Smuzhiyun 	while (commit_transaction->t_forget) {
973*4882a593Smuzhiyun 		transaction_t *cp_transaction;
974*4882a593Smuzhiyun 		struct buffer_head *bh;
975*4882a593Smuzhiyun 		int try_to_free = 0;
976*4882a593Smuzhiyun 		bool drop_ref;
977*4882a593Smuzhiyun 
978*4882a593Smuzhiyun 		jh = commit_transaction->t_forget;
979*4882a593Smuzhiyun 		spin_unlock(&journal->j_list_lock);
980*4882a593Smuzhiyun 		bh = jh2bh(jh);
981*4882a593Smuzhiyun 		/*
982*4882a593Smuzhiyun 		 * Get a reference so that bh cannot be freed before we are
983*4882a593Smuzhiyun 		 * done with it.
984*4882a593Smuzhiyun 		 */
985*4882a593Smuzhiyun 		get_bh(bh);
986*4882a593Smuzhiyun 		spin_lock(&jh->b_state_lock);
987*4882a593Smuzhiyun 		J_ASSERT_JH(jh,	jh->b_transaction == commit_transaction);
988*4882a593Smuzhiyun 
989*4882a593Smuzhiyun 		/*
990*4882a593Smuzhiyun 		 * If there is undo-protected committed data against
991*4882a593Smuzhiyun 		 * this buffer, then we can remove it now.  If it is a
992*4882a593Smuzhiyun 		 * buffer needing such protection, the old frozen_data
993*4882a593Smuzhiyun 		 * field now points to a committed version of the
994*4882a593Smuzhiyun 		 * buffer, so rotate that field to the new committed
995*4882a593Smuzhiyun 		 * data.
996*4882a593Smuzhiyun 		 *
997*4882a593Smuzhiyun 		 * Otherwise, we can just throw away the frozen data now.
998*4882a593Smuzhiyun 		 *
999*4882a593Smuzhiyun 		 * We also know that the frozen data has already fired
1000*4882a593Smuzhiyun 		 * its triggers if they exist, so we can clear that too.
1001*4882a593Smuzhiyun 		 */
1002*4882a593Smuzhiyun 		if (jh->b_committed_data) {
1003*4882a593Smuzhiyun 			jbd2_free(jh->b_committed_data, bh->b_size);
1004*4882a593Smuzhiyun 			jh->b_committed_data = NULL;
1005*4882a593Smuzhiyun 			if (jh->b_frozen_data) {
1006*4882a593Smuzhiyun 				jh->b_committed_data = jh->b_frozen_data;
1007*4882a593Smuzhiyun 				jh->b_frozen_data = NULL;
1008*4882a593Smuzhiyun 				jh->b_frozen_triggers = NULL;
1009*4882a593Smuzhiyun 			}
1010*4882a593Smuzhiyun 		} else if (jh->b_frozen_data) {
1011*4882a593Smuzhiyun 			jbd2_free(jh->b_frozen_data, bh->b_size);
1012*4882a593Smuzhiyun 			jh->b_frozen_data = NULL;
1013*4882a593Smuzhiyun 			jh->b_frozen_triggers = NULL;
1014*4882a593Smuzhiyun 		}
1015*4882a593Smuzhiyun 
1016*4882a593Smuzhiyun 		spin_lock(&journal->j_list_lock);
1017*4882a593Smuzhiyun 		cp_transaction = jh->b_cp_transaction;
1018*4882a593Smuzhiyun 		if (cp_transaction) {
1019*4882a593Smuzhiyun 			JBUFFER_TRACE(jh, "remove from old cp transaction");
1020*4882a593Smuzhiyun 			cp_transaction->t_chp_stats.cs_dropped++;
1021*4882a593Smuzhiyun 			__jbd2_journal_remove_checkpoint(jh);
1022*4882a593Smuzhiyun 		}
1023*4882a593Smuzhiyun 
1024*4882a593Smuzhiyun 		/* Only re-checkpoint the buffer_head if it is marked
1025*4882a593Smuzhiyun 		 * dirty.  If the buffer was added to the BJ_Forget list
1026*4882a593Smuzhiyun 		 * by jbd2_journal_forget, it may no longer be dirty and
1027*4882a593Smuzhiyun 		 * there's no point in keeping a checkpoint record for
1028*4882a593Smuzhiyun 		 * it. */
1029*4882a593Smuzhiyun 
1030*4882a593Smuzhiyun 		/*
1031*4882a593Smuzhiyun 		 * A buffer which has been freed while still being journaled
1032*4882a593Smuzhiyun 		 * by a previous transaction, refile the buffer to BJ_Forget of
1033*4882a593Smuzhiyun 		 * the running transaction. If the just committed transaction
1034*4882a593Smuzhiyun 		 * contains "add to orphan" operation, we can completely
1035*4882a593Smuzhiyun 		 * invalidate the buffer now. We are rather through in that
1036*4882a593Smuzhiyun 		 * since the buffer may be still accessible when blocksize <
1037*4882a593Smuzhiyun 		 * pagesize and it is attached to the last partial page.
1038*4882a593Smuzhiyun 		 */
1039*4882a593Smuzhiyun 		if (buffer_freed(bh) && !jh->b_next_transaction) {
1040*4882a593Smuzhiyun 			struct address_space *mapping;
1041*4882a593Smuzhiyun 
1042*4882a593Smuzhiyun 			clear_buffer_freed(bh);
1043*4882a593Smuzhiyun 			clear_buffer_jbddirty(bh);
1044*4882a593Smuzhiyun 
1045*4882a593Smuzhiyun 			/*
1046*4882a593Smuzhiyun 			 * Block device buffers need to stay mapped all the
1047*4882a593Smuzhiyun 			 * time, so it is enough to clear buffer_jbddirty and
1048*4882a593Smuzhiyun 			 * buffer_freed bits. For the file mapping buffers (i.e.
1049*4882a593Smuzhiyun 			 * journalled data) we need to unmap buffer and clear
1050*4882a593Smuzhiyun 			 * more bits. We also need to be careful about the check
1051*4882a593Smuzhiyun 			 * because the data page mapping can get cleared under
1052*4882a593Smuzhiyun 			 * our hands. Note that if mapping == NULL, we don't
1053*4882a593Smuzhiyun 			 * need to make buffer unmapped because the page is
1054*4882a593Smuzhiyun 			 * already detached from the mapping and buffers cannot
1055*4882a593Smuzhiyun 			 * get reused.
1056*4882a593Smuzhiyun 			 */
1057*4882a593Smuzhiyun 			mapping = READ_ONCE(bh->b_page->mapping);
1058*4882a593Smuzhiyun 			if (mapping && !sb_is_blkdev_sb(mapping->host->i_sb)) {
1059*4882a593Smuzhiyun 				clear_buffer_mapped(bh);
1060*4882a593Smuzhiyun 				clear_buffer_new(bh);
1061*4882a593Smuzhiyun 				clear_buffer_req(bh);
1062*4882a593Smuzhiyun 				bh->b_bdev = NULL;
1063*4882a593Smuzhiyun 			}
1064*4882a593Smuzhiyun 		}
1065*4882a593Smuzhiyun 
1066*4882a593Smuzhiyun 		if (buffer_jbddirty(bh)) {
1067*4882a593Smuzhiyun 			JBUFFER_TRACE(jh, "add to new checkpointing trans");
1068*4882a593Smuzhiyun 			__jbd2_journal_insert_checkpoint(jh, commit_transaction);
1069*4882a593Smuzhiyun 			if (is_journal_aborted(journal))
1070*4882a593Smuzhiyun 				clear_buffer_jbddirty(bh);
1071*4882a593Smuzhiyun 		} else {
1072*4882a593Smuzhiyun 			J_ASSERT_BH(bh, !buffer_dirty(bh));
1073*4882a593Smuzhiyun 			/*
1074*4882a593Smuzhiyun 			 * A buffer on the BJ_Forget list that is not jbddirty
1075*4882a593Smuzhiyun 			 * has been freed by this transaction, and hence it
1076*4882a593Smuzhiyun 			 * could not have been reallocated until this
1077*4882a593Smuzhiyun 			 * transaction has committed. *BUT* it could be
1078*4882a593Smuzhiyun 			 * reallocated once we have written all the data to
1079*4882a593Smuzhiyun 			 * disk and before we process the buffer on the
1080*4882a593Smuzhiyun 			 * BJ_Forget list.
1081*4882a593Smuzhiyun 			 */
1082*4882a593Smuzhiyun 			if (!jh->b_next_transaction)
1083*4882a593Smuzhiyun 				try_to_free = 1;
1084*4882a593Smuzhiyun 		}
1085*4882a593Smuzhiyun 		JBUFFER_TRACE(jh, "refile or unfile buffer");
1086*4882a593Smuzhiyun 		drop_ref = __jbd2_journal_refile_buffer(jh);
1087*4882a593Smuzhiyun 		spin_unlock(&jh->b_state_lock);
1088*4882a593Smuzhiyun 		if (drop_ref)
1089*4882a593Smuzhiyun 			jbd2_journal_put_journal_head(jh);
1090*4882a593Smuzhiyun 		if (try_to_free)
1091*4882a593Smuzhiyun 			release_buffer_page(bh);	/* Drops bh reference */
1092*4882a593Smuzhiyun 		else
1093*4882a593Smuzhiyun 			__brelse(bh);
1094*4882a593Smuzhiyun 		cond_resched_lock(&journal->j_list_lock);
1095*4882a593Smuzhiyun 	}
1096*4882a593Smuzhiyun 	spin_unlock(&journal->j_list_lock);
1097*4882a593Smuzhiyun 	/*
1098*4882a593Smuzhiyun 	 * This is a bit sleazy.  We use j_list_lock to protect the transition
1099*4882a593Smuzhiyun 	 * of a transaction into the T_FINISHED state and the call to
1100*4882a593Smuzhiyun 	 * __jbd2_journal_drop_transaction(). Otherwise we could race with
1101*4882a593Smuzhiyun 	 * other checkpointing code processing the transaction...
1102*4882a593Smuzhiyun 	 */
1103*4882a593Smuzhiyun 	write_lock(&journal->j_state_lock);
1104*4882a593Smuzhiyun 	spin_lock(&journal->j_list_lock);
1105*4882a593Smuzhiyun 	/*
1106*4882a593Smuzhiyun 	 * Now recheck if some buffers did not get attached to the transaction
1107*4882a593Smuzhiyun 	 * while the lock was dropped...
1108*4882a593Smuzhiyun 	 */
1109*4882a593Smuzhiyun 	if (commit_transaction->t_forget) {
1110*4882a593Smuzhiyun 		spin_unlock(&journal->j_list_lock);
1111*4882a593Smuzhiyun 		write_unlock(&journal->j_state_lock);
1112*4882a593Smuzhiyun 		goto restart_loop;
1113*4882a593Smuzhiyun 	}
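	/*
	 * Both j_state_lock (write) and j_list_lock are now held, in that
	 * nesting order; the same pair is retaken below before setting
	 * T_FINISHED, which is what makes the j_list_lock protection
	 * described above work.
	 */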
1114*4882a593Smuzhiyun 
1115*4882a593Smuzhiyun 	/* Add the transaction to the checkpoint list.
1116*4882a593Smuzhiyun 	 * __jbd2_journal_remove_checkpoint() cannot destroy the transaction
1117*4882a593Smuzhiyun 	 * under us because it is not yet marked as T_FINISHED. */
1118*4882a593Smuzhiyun 	if (journal->j_checkpoint_transactions == NULL) {
1119*4882a593Smuzhiyun 		journal->j_checkpoint_transactions = commit_transaction;
1120*4882a593Smuzhiyun 		commit_transaction->t_cpnext = commit_transaction;
1121*4882a593Smuzhiyun 		commit_transaction->t_cpprev = commit_transaction;
1122*4882a593Smuzhiyun 	} else {
1123*4882a593Smuzhiyun 		commit_transaction->t_cpnext =
1124*4882a593Smuzhiyun 			journal->j_checkpoint_transactions;
1125*4882a593Smuzhiyun 		commit_transaction->t_cpprev =
1126*4882a593Smuzhiyun 			commit_transaction->t_cpnext->t_cpprev;
1127*4882a593Smuzhiyun 		commit_transaction->t_cpnext->t_cpprev =
1128*4882a593Smuzhiyun 			commit_transaction;
1129*4882a593Smuzhiyun 		commit_transaction->t_cpprev->t_cpnext =
1130*4882a593Smuzhiyun 			commit_transaction;
1131*4882a593Smuzhiyun 	}
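	/*
	 * Either way the list stays circular; in the non-empty case the
	 * transaction is spliced in just before the current head, i.e. at
	 * the logical tail, so the list preserves commit order:
	 *
	 *   ... <-> old_tail <-> commit_transaction <-> head <-> ...
	 */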
1132*4882a593Smuzhiyun 	spin_unlock(&journal->j_list_lock);
1133*4882a593Smuzhiyun 
1134*4882a593Smuzhiyun 	/* Done with this transaction! */
1135*4882a593Smuzhiyun 
1136*4882a593Smuzhiyun 	jbd_debug(3, "JBD2: commit phase 7\n");
1137*4882a593Smuzhiyun 
1138*4882a593Smuzhiyun 	J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);
1139*4882a593Smuzhiyun 
1140*4882a593Smuzhiyun 	commit_transaction->t_start = jiffies;
1141*4882a593Smuzhiyun 	stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
1142*4882a593Smuzhiyun 					      commit_transaction->t_start);
1143*4882a593Smuzhiyun 
1144*4882a593Smuzhiyun 	/*
1145*4882a593Smuzhiyun 	 * File the transaction statistics
1146*4882a593Smuzhiyun 	 */
1147*4882a593Smuzhiyun 	stats.ts_tid = commit_transaction->t_tid;
1148*4882a593Smuzhiyun 	stats.run.rs_handle_count =
1149*4882a593Smuzhiyun 		atomic_read(&commit_transaction->t_handle_count);
1150*4882a593Smuzhiyun 	trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
1151*4882a593Smuzhiyun 			     commit_transaction->t_tid, &stats.run);
1152*4882a593Smuzhiyun 	stats.ts_requested = (commit_transaction->t_requested) ? 1 : 0;
1153*4882a593Smuzhiyun 
1154*4882a593Smuzhiyun 	commit_transaction->t_state = T_COMMIT_CALLBACK;
1155*4882a593Smuzhiyun 	J_ASSERT(commit_transaction == journal->j_committing_transaction);
1156*4882a593Smuzhiyun 	journal->j_commit_sequence = commit_transaction->t_tid;
1157*4882a593Smuzhiyun 	journal->j_committing_transaction = NULL;
1158*4882a593Smuzhiyun 	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1159*4882a593Smuzhiyun 
1160*4882a593Smuzhiyun 	/*
1161*4882a593Smuzhiyun 	 * Weight the existing average higher than the new commit time so we
1162*4882a593Smuzhiyun 	 * don't react too strongly to vast changes in the commit time.
1163*4882a593Smuzhiyun 	 */
1164*4882a593Smuzhiyun 	if (likely(journal->j_average_commit_time))
1165*4882a593Smuzhiyun 		journal->j_average_commit_time = (commit_time +
1166*4882a593Smuzhiyun 				journal->j_average_commit_time*3) / 4;
1167*4882a593Smuzhiyun 	else
1168*4882a593Smuzhiyun 		journal->j_average_commit_time = commit_time;
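	/*
	 * In other words, an exponential moving average giving the new
	 * sample a 1/4 weight:
	 *
	 *   avg' = (commit_time + 3 * avg) / 4
	 *
	 * so a single outlier commit moves the average by at most a
	 * quarter of its deviation.
	 */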
1169*4882a593Smuzhiyun 
1170*4882a593Smuzhiyun 	write_unlock(&journal->j_state_lock);
1171*4882a593Smuzhiyun 
1172*4882a593Smuzhiyun 	if (journal->j_commit_callback)
1173*4882a593Smuzhiyun 		journal->j_commit_callback(journal, commit_transaction);
1174*4882a593Smuzhiyun 	if (journal->j_fc_cleanup_callback)
1175*4882a593Smuzhiyun 		journal->j_fc_cleanup_callback(journal, 1);
1176*4882a593Smuzhiyun 
1177*4882a593Smuzhiyun 	trace_jbd2_end_commit(journal, commit_transaction);
1178*4882a593Smuzhiyun 	jbd_debug(1, "JBD2: commit %d complete, head %d\n",
1179*4882a593Smuzhiyun 		  journal->j_commit_sequence, journal->j_tail_sequence);
1180*4882a593Smuzhiyun 
1181*4882a593Smuzhiyun 	write_lock(&journal->j_state_lock);
1182*4882a593Smuzhiyun 	journal->j_flags &= ~JBD2_FULL_COMMIT_ONGOING;
1183*4882a593Smuzhiyun 	journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING;
1184*4882a593Smuzhiyun 	spin_lock(&journal->j_list_lock);
1185*4882a593Smuzhiyun 	commit_transaction->t_state = T_FINISHED;
1186*4882a593Smuzhiyun 	/* Check if the transaction can be dropped now that we are finished */
1187*4882a593Smuzhiyun 	if (commit_transaction->t_checkpoint_list == NULL &&
1188*4882a593Smuzhiyun 	    commit_transaction->t_checkpoint_io_list == NULL) {
1189*4882a593Smuzhiyun 		__jbd2_journal_drop_transaction(journal, commit_transaction);
1190*4882a593Smuzhiyun 		jbd2_journal_free_transaction(commit_transaction);
1191*4882a593Smuzhiyun 	}
1192*4882a593Smuzhiyun 	spin_unlock(&journal->j_list_lock);
1193*4882a593Smuzhiyun 	write_unlock(&journal->j_state_lock);
1194*4882a593Smuzhiyun 	wake_up(&journal->j_wait_done_commit);
1195*4882a593Smuzhiyun 	wake_up(&journal->j_fc_wait);
1196*4882a593Smuzhiyun 
1197*4882a593Smuzhiyun 	/*
1198*4882a593Smuzhiyun 	 * Fold this commit's stats into the journal-wide running totals.
1199*4882a593Smuzhiyun 	 */
1200*4882a593Smuzhiyun 	spin_lock(&journal->j_history_lock);
1201*4882a593Smuzhiyun 	journal->j_stats.ts_tid++;
1202*4882a593Smuzhiyun 	journal->j_stats.ts_requested += stats.ts_requested;
1203*4882a593Smuzhiyun 	journal->j_stats.run.rs_wait += stats.run.rs_wait;
1204*4882a593Smuzhiyun 	journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay;
1205*4882a593Smuzhiyun 	journal->j_stats.run.rs_running += stats.run.rs_running;
1206*4882a593Smuzhiyun 	journal->j_stats.run.rs_locked += stats.run.rs_locked;
1207*4882a593Smuzhiyun 	journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
1208*4882a593Smuzhiyun 	journal->j_stats.run.rs_logging += stats.run.rs_logging;
1209*4882a593Smuzhiyun 	journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
1210*4882a593Smuzhiyun 	journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
1211*4882a593Smuzhiyun 	journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
1212*4882a593Smuzhiyun 	spin_unlock(&journal->j_history_lock);
1213*4882a593Smuzhiyun }
1214
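/*
 * Illustrative sketch only (not part of the original file): the open-coded
 * circular-list splice used above to queue the committed transaction on
 * journal->j_checkpoint_transactions, factored into a helper. The helper
 * name is hypothetical; transaction_t and the t_cpnext/t_cpprev links are
 * the real jbd2 types and fields.
 */
static inline void jbd2_cp_list_add_tail_sketch(transaction_t *t,
						transaction_t **headp)
{
	if (*headp == NULL) {
		/* First entry: a one-element circular list points at itself. */
		*headp = t;
		t->t_cpnext = t;
		t->t_cpprev = t;
	} else {
		/* Splice in just before the head, i.e. at the logical tail. */
		t->t_cpnext = *headp;
		t->t_cpprev = (*headp)->t_cpprev;
		t->t_cpnext->t_cpprev = t;
		t->t_cpprev->t_cpnext = t;
	}
}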