1*4882a593Smuzhiyun /* SPDX-License-Identifier: GPL-2.0 */
2*4882a593Smuzhiyun #ifndef _LINUX_IVERSION_H
3*4882a593Smuzhiyun #define _LINUX_IVERSION_H
4*4882a593Smuzhiyun
5*4882a593Smuzhiyun #include <linux/fs.h>
6*4882a593Smuzhiyun
/*
 * The inode->i_version field:
 * ---------------------------
 * The change attribute (i_version) is mandated by NFSv4 and is mostly for
 * knfsd, but is also used for other purposes (e.g. IMA). The i_version must
 * appear different to observers if there was a change to the inode's data or
 * metadata since it was last queried.
 *
 * Observers see the i_version as a 64-bit number that never decreases. If it
 * remains the same since it was last checked, then nothing has changed in the
 * inode. If it's different then something has changed. Observers cannot infer
 * anything about the nature or magnitude of the changes from the value, only
 * that the inode has changed in some fashion.
 *
 * Not all filesystems properly implement the i_version counter. Subsystems that
 * want to use the i_version field on an inode should first check whether the
 * filesystem sets the SB_I_VERSION flag (usually via the IS_I_VERSION macro).
 *
 * Those that set SB_I_VERSION will automatically have their i_version counter
 * incremented on writes to normal files. If the SB_I_VERSION is not set, then
 * the VFS will not touch it on writes, and the filesystem can use it how it
 * wishes. Note that the filesystem is always responsible for updating the
 * i_version on namespace changes in directories (mkdir, rmdir, unlink, etc.).
 * We consider these sorts of filesystems to have a kernel-managed i_version.
 *
 * It may be impractical for filesystems to keep i_version updates atomic with
 * respect to the changes that cause them. They should, however, guarantee
 * that i_version updates are never visible before the changes that caused
 * them. Also, i_version updates should never be delayed longer than it takes
 * the original change to reach disk.
 *
 * This implementation uses the low bit in the i_version field as a flag to
 * track when the value has been queried. If it has not been queried since it
 * was last incremented, we can skip the increment in most cases.
 *
 * In the event that we're updating the ctime, we will usually go ahead and
 * bump the i_version anyway. Since that has to go to stable storage in some
 * fashion, we might as well increment it as well.
 *
 * With this implementation, the value should always appear to observers to
 * increase over time if the file has changed. It's recommended to use the
 * inode_eq_iversion() helper to compare values.
 *
 * Note that some filesystems (e.g. NFS and AFS) just use the field to store
 * a server-provided value (for the most part). For that reason, those
 * filesystems do not set SB_I_VERSION. These filesystems are considered to
 * have a self-managed i_version.
 *
 * Persistently storing the i_version
 * ----------------------------------
 * Queries of the i_version field are not gated on the value having reached
 * the backing store. It's always possible that the host could crash after
 * allowing a query of the value but before it has made it to disk.
 *
 * To mitigate this problem, filesystems should always use
 * inode_set_iversion_queried when loading an existing inode from disk. This
 * ensures that the next attempted inode increment will result in the value
 * changing.
 *
 * Storing the value to disk therefore does not count as a query, so those
 * filesystems should use inode_peek_iversion to grab the value to be stored.
 * There is no need to flag the value as having been queried in that case.
 */
70*4882a593Smuzhiyun
/*
 * We borrow the lowest bit in the i_version to use as a flag to tell whether
 * it has been queried since we last incremented it. If it has, then we must
 * increment it on the next change. After that, we can clear the flag and
 * avoid incrementing it again until it has again been queried.
 *
 * I_VERSION_QUERIED_SHIFT: number of low bits reserved for the flag, i.e. the
 *                          shift between the raw field and the counter value.
 * I_VERSION_QUERIED:       mask of the "has been queried" flag (bit 0).
 * I_VERSION_INCREMENT:     raw amount that advances the counter by one while
 *                          leaving the flag bit untouched.
 */
#define I_VERSION_QUERIED_SHIFT	(1)
#define I_VERSION_QUERIED	(1ULL << (I_VERSION_QUERIED_SHIFT - 1))
#define I_VERSION_INCREMENT	(1ULL << I_VERSION_QUERIED_SHIFT)
80*4882a593Smuzhiyun
81*4882a593Smuzhiyun /**
82*4882a593Smuzhiyun * inode_set_iversion_raw - set i_version to the specified raw value
83*4882a593Smuzhiyun * @inode: inode to set
84*4882a593Smuzhiyun * @val: new i_version value to set
85*4882a593Smuzhiyun *
86*4882a593Smuzhiyun * Set @inode's i_version field to @val. This function is for use by
87*4882a593Smuzhiyun * filesystems that self-manage the i_version.
88*4882a593Smuzhiyun *
89*4882a593Smuzhiyun * For example, the NFS client stores its NFSv4 change attribute in this way,
90*4882a593Smuzhiyun * and the AFS client stores the data_version from the server here.
91*4882a593Smuzhiyun */
92*4882a593Smuzhiyun static inline void
inode_set_iversion_raw(struct inode * inode,u64 val)93*4882a593Smuzhiyun inode_set_iversion_raw(struct inode *inode, u64 val)
94*4882a593Smuzhiyun {
95*4882a593Smuzhiyun atomic64_set(&inode->i_version, val);
96*4882a593Smuzhiyun }
97*4882a593Smuzhiyun
98*4882a593Smuzhiyun /**
99*4882a593Smuzhiyun * inode_peek_iversion_raw - grab a "raw" iversion value
100*4882a593Smuzhiyun * @inode: inode from which i_version should be read
101*4882a593Smuzhiyun *
102*4882a593Smuzhiyun * Grab a "raw" inode->i_version value and return it. The i_version is not
103*4882a593Smuzhiyun * flagged or converted in any way. This is mostly used to access a self-managed
104*4882a593Smuzhiyun * i_version.
105*4882a593Smuzhiyun *
106*4882a593Smuzhiyun * With those filesystems, we want to treat the i_version as an entirely
107*4882a593Smuzhiyun * opaque value.
108*4882a593Smuzhiyun */
109*4882a593Smuzhiyun static inline u64
inode_peek_iversion_raw(const struct inode * inode)110*4882a593Smuzhiyun inode_peek_iversion_raw(const struct inode *inode)
111*4882a593Smuzhiyun {
112*4882a593Smuzhiyun return atomic64_read(&inode->i_version);
113*4882a593Smuzhiyun }
114*4882a593Smuzhiyun
115*4882a593Smuzhiyun /**
116*4882a593Smuzhiyun * inode_set_max_iversion_raw - update i_version new value is larger
117*4882a593Smuzhiyun * @inode: inode to set
118*4882a593Smuzhiyun * @val: new i_version to set
119*4882a593Smuzhiyun *
120*4882a593Smuzhiyun * Some self-managed filesystems (e.g Ceph) will only update the i_version
121*4882a593Smuzhiyun * value if the new value is larger than the one we already have.
122*4882a593Smuzhiyun */
123*4882a593Smuzhiyun static inline void
inode_set_max_iversion_raw(struct inode * inode,u64 val)124*4882a593Smuzhiyun inode_set_max_iversion_raw(struct inode *inode, u64 val)
125*4882a593Smuzhiyun {
126*4882a593Smuzhiyun u64 cur, old;
127*4882a593Smuzhiyun
128*4882a593Smuzhiyun cur = inode_peek_iversion_raw(inode);
129*4882a593Smuzhiyun for (;;) {
130*4882a593Smuzhiyun if (cur > val)
131*4882a593Smuzhiyun break;
132*4882a593Smuzhiyun old = atomic64_cmpxchg(&inode->i_version, cur, val);
133*4882a593Smuzhiyun if (likely(old == cur))
134*4882a593Smuzhiyun break;
135*4882a593Smuzhiyun cur = old;
136*4882a593Smuzhiyun }
137*4882a593Smuzhiyun }
138*4882a593Smuzhiyun
139*4882a593Smuzhiyun /**
140*4882a593Smuzhiyun * inode_set_iversion - set i_version to a particular value
141*4882a593Smuzhiyun * @inode: inode to set
142*4882a593Smuzhiyun * @val: new i_version value to set
143*4882a593Smuzhiyun *
144*4882a593Smuzhiyun * Set @inode's i_version field to @val. This function is for filesystems with
145*4882a593Smuzhiyun * a kernel-managed i_version, for initializing a newly-created inode from
146*4882a593Smuzhiyun * scratch.
147*4882a593Smuzhiyun *
148*4882a593Smuzhiyun * In this case, we do not set the QUERIED flag since we know that this value
149*4882a593Smuzhiyun * has never been queried.
150*4882a593Smuzhiyun */
151*4882a593Smuzhiyun static inline void
inode_set_iversion(struct inode * inode,u64 val)152*4882a593Smuzhiyun inode_set_iversion(struct inode *inode, u64 val)
153*4882a593Smuzhiyun {
154*4882a593Smuzhiyun inode_set_iversion_raw(inode, val << I_VERSION_QUERIED_SHIFT);
155*4882a593Smuzhiyun }
156*4882a593Smuzhiyun
157*4882a593Smuzhiyun /**
158*4882a593Smuzhiyun * inode_set_iversion_queried - set i_version to a particular value as quereied
159*4882a593Smuzhiyun * @inode: inode to set
160*4882a593Smuzhiyun * @val: new i_version value to set
161*4882a593Smuzhiyun *
162*4882a593Smuzhiyun * Set @inode's i_version field to @val, and flag it for increment on the next
163*4882a593Smuzhiyun * change.
164*4882a593Smuzhiyun *
165*4882a593Smuzhiyun * Filesystems that persistently store the i_version on disk should use this
166*4882a593Smuzhiyun * when loading an existing inode from disk.
167*4882a593Smuzhiyun *
168*4882a593Smuzhiyun * When loading in an i_version value from a backing store, we can't be certain
169*4882a593Smuzhiyun * that it wasn't previously viewed before being stored. Thus, we must assume
170*4882a593Smuzhiyun * that it was, to ensure that we don't end up handing out the same value for
171*4882a593Smuzhiyun * different versions of the same inode.
172*4882a593Smuzhiyun */
173*4882a593Smuzhiyun static inline void
inode_set_iversion_queried(struct inode * inode,u64 val)174*4882a593Smuzhiyun inode_set_iversion_queried(struct inode *inode, u64 val)
175*4882a593Smuzhiyun {
176*4882a593Smuzhiyun inode_set_iversion_raw(inode, (val << I_VERSION_QUERIED_SHIFT) |
177*4882a593Smuzhiyun I_VERSION_QUERIED);
178*4882a593Smuzhiyun }
179*4882a593Smuzhiyun
180*4882a593Smuzhiyun /**
181*4882a593Smuzhiyun * inode_maybe_inc_iversion - increments i_version
182*4882a593Smuzhiyun * @inode: inode with the i_version that should be updated
183*4882a593Smuzhiyun * @force: increment the counter even if it's not necessary?
184*4882a593Smuzhiyun *
185*4882a593Smuzhiyun * Every time the inode is modified, the i_version field must be seen to have
186*4882a593Smuzhiyun * changed by any observer.
187*4882a593Smuzhiyun *
188*4882a593Smuzhiyun * If "force" is set or the QUERIED flag is set, then ensure that we increment
189*4882a593Smuzhiyun * the value, and clear the queried flag.
190*4882a593Smuzhiyun *
191*4882a593Smuzhiyun * In the common case where neither is set, then we can return "false" without
192*4882a593Smuzhiyun * updating i_version.
193*4882a593Smuzhiyun *
194*4882a593Smuzhiyun * If this function returns false, and no other metadata has changed, then we
195*4882a593Smuzhiyun * can avoid logging the metadata.
196*4882a593Smuzhiyun */
197*4882a593Smuzhiyun static inline bool
inode_maybe_inc_iversion(struct inode * inode,bool force)198*4882a593Smuzhiyun inode_maybe_inc_iversion(struct inode *inode, bool force)
199*4882a593Smuzhiyun {
200*4882a593Smuzhiyun u64 cur, old, new;
201*4882a593Smuzhiyun
202*4882a593Smuzhiyun /*
203*4882a593Smuzhiyun * The i_version field is not strictly ordered with any other inode
204*4882a593Smuzhiyun * information, but the legacy inode_inc_iversion code used a spinlock
205*4882a593Smuzhiyun * to serialize increments.
206*4882a593Smuzhiyun *
207*4882a593Smuzhiyun * Here, we add full memory barriers to ensure that any de-facto
208*4882a593Smuzhiyun * ordering with other info is preserved.
209*4882a593Smuzhiyun *
210*4882a593Smuzhiyun * This barrier pairs with the barrier in inode_query_iversion()
211*4882a593Smuzhiyun */
212*4882a593Smuzhiyun smp_mb();
213*4882a593Smuzhiyun cur = inode_peek_iversion_raw(inode);
214*4882a593Smuzhiyun for (;;) {
215*4882a593Smuzhiyun /* If flag is clear then we needn't do anything */
216*4882a593Smuzhiyun if (!force && !(cur & I_VERSION_QUERIED))
217*4882a593Smuzhiyun return false;
218*4882a593Smuzhiyun
219*4882a593Smuzhiyun /* Since lowest bit is flag, add 2 to avoid it */
220*4882a593Smuzhiyun new = (cur & ~I_VERSION_QUERIED) + I_VERSION_INCREMENT;
221*4882a593Smuzhiyun
222*4882a593Smuzhiyun old = atomic64_cmpxchg(&inode->i_version, cur, new);
223*4882a593Smuzhiyun if (likely(old == cur))
224*4882a593Smuzhiyun break;
225*4882a593Smuzhiyun cur = old;
226*4882a593Smuzhiyun }
227*4882a593Smuzhiyun return true;
228*4882a593Smuzhiyun }
229*4882a593Smuzhiyun
230*4882a593Smuzhiyun
231*4882a593Smuzhiyun /**
232*4882a593Smuzhiyun * inode_inc_iversion - forcibly increment i_version
233*4882a593Smuzhiyun * @inode: inode that needs to be updated
234*4882a593Smuzhiyun *
235*4882a593Smuzhiyun * Forcbily increment the i_version field. This always results in a change to
236*4882a593Smuzhiyun * the observable value.
237*4882a593Smuzhiyun */
238*4882a593Smuzhiyun static inline void
inode_inc_iversion(struct inode * inode)239*4882a593Smuzhiyun inode_inc_iversion(struct inode *inode)
240*4882a593Smuzhiyun {
241*4882a593Smuzhiyun inode_maybe_inc_iversion(inode, true);
242*4882a593Smuzhiyun }
243*4882a593Smuzhiyun
244*4882a593Smuzhiyun /**
245*4882a593Smuzhiyun * inode_iversion_need_inc - is the i_version in need of being incremented?
246*4882a593Smuzhiyun * @inode: inode to check
247*4882a593Smuzhiyun *
248*4882a593Smuzhiyun * Returns whether the inode->i_version counter needs incrementing on the next
249*4882a593Smuzhiyun * change. Just fetch the value and check the QUERIED flag.
250*4882a593Smuzhiyun */
251*4882a593Smuzhiyun static inline bool
inode_iversion_need_inc(struct inode * inode)252*4882a593Smuzhiyun inode_iversion_need_inc(struct inode *inode)
253*4882a593Smuzhiyun {
254*4882a593Smuzhiyun return inode_peek_iversion_raw(inode) & I_VERSION_QUERIED;
255*4882a593Smuzhiyun }
256*4882a593Smuzhiyun
257*4882a593Smuzhiyun /**
258*4882a593Smuzhiyun * inode_inc_iversion_raw - forcibly increment raw i_version
259*4882a593Smuzhiyun * @inode: inode that needs to be updated
260*4882a593Smuzhiyun *
261*4882a593Smuzhiyun * Forcbily increment the raw i_version field. This always results in a change
262*4882a593Smuzhiyun * to the raw value.
263*4882a593Smuzhiyun *
264*4882a593Smuzhiyun * NFS will use the i_version field to store the value from the server. It
265*4882a593Smuzhiyun * mostly treats it as opaque, but in the case where it holds a write
266*4882a593Smuzhiyun * delegation, it must increment the value itself. This function does that.
267*4882a593Smuzhiyun */
268*4882a593Smuzhiyun static inline void
inode_inc_iversion_raw(struct inode * inode)269*4882a593Smuzhiyun inode_inc_iversion_raw(struct inode *inode)
270*4882a593Smuzhiyun {
271*4882a593Smuzhiyun atomic64_inc(&inode->i_version);
272*4882a593Smuzhiyun }
273*4882a593Smuzhiyun
274*4882a593Smuzhiyun /**
275*4882a593Smuzhiyun * inode_peek_iversion - read i_version without flagging it to be incremented
276*4882a593Smuzhiyun * @inode: inode from which i_version should be read
277*4882a593Smuzhiyun *
278*4882a593Smuzhiyun * Read the inode i_version counter for an inode without registering it as a
279*4882a593Smuzhiyun * query.
280*4882a593Smuzhiyun *
281*4882a593Smuzhiyun * This is typically used by local filesystems that need to store an i_version
282*4882a593Smuzhiyun * on disk. In that situation, it's not necessary to flag it as having been
283*4882a593Smuzhiyun * viewed, as the result won't be used to gauge changes from that point.
284*4882a593Smuzhiyun */
285*4882a593Smuzhiyun static inline u64
inode_peek_iversion(const struct inode * inode)286*4882a593Smuzhiyun inode_peek_iversion(const struct inode *inode)
287*4882a593Smuzhiyun {
288*4882a593Smuzhiyun return inode_peek_iversion_raw(inode) >> I_VERSION_QUERIED_SHIFT;
289*4882a593Smuzhiyun }
290*4882a593Smuzhiyun
291*4882a593Smuzhiyun /**
292*4882a593Smuzhiyun * inode_query_iversion - read i_version for later use
293*4882a593Smuzhiyun * @inode: inode from which i_version should be read
294*4882a593Smuzhiyun *
295*4882a593Smuzhiyun * Read the inode i_version counter. This should be used by callers that wish
296*4882a593Smuzhiyun * to store the returned i_version for later comparison. This will guarantee
297*4882a593Smuzhiyun * that a later query of the i_version will result in a different value if
298*4882a593Smuzhiyun * anything has changed.
299*4882a593Smuzhiyun *
300*4882a593Smuzhiyun * In this implementation, we fetch the current value, set the QUERIED flag and
301*4882a593Smuzhiyun * then try to swap it into place with a cmpxchg, if it wasn't already set. If
302*4882a593Smuzhiyun * that fails, we try again with the newly fetched value from the cmpxchg.
303*4882a593Smuzhiyun */
304*4882a593Smuzhiyun static inline u64
inode_query_iversion(struct inode * inode)305*4882a593Smuzhiyun inode_query_iversion(struct inode *inode)
306*4882a593Smuzhiyun {
307*4882a593Smuzhiyun u64 cur, old, new;
308*4882a593Smuzhiyun
309*4882a593Smuzhiyun cur = inode_peek_iversion_raw(inode);
310*4882a593Smuzhiyun for (;;) {
311*4882a593Smuzhiyun /* If flag is already set, then no need to swap */
312*4882a593Smuzhiyun if (cur & I_VERSION_QUERIED) {
313*4882a593Smuzhiyun /*
314*4882a593Smuzhiyun * This barrier (and the implicit barrier in the
315*4882a593Smuzhiyun * cmpxchg below) pairs with the barrier in
316*4882a593Smuzhiyun * inode_maybe_inc_iversion().
317*4882a593Smuzhiyun */
318*4882a593Smuzhiyun smp_mb();
319*4882a593Smuzhiyun break;
320*4882a593Smuzhiyun }
321*4882a593Smuzhiyun
322*4882a593Smuzhiyun new = cur | I_VERSION_QUERIED;
323*4882a593Smuzhiyun old = atomic64_cmpxchg(&inode->i_version, cur, new);
324*4882a593Smuzhiyun if (likely(old == cur))
325*4882a593Smuzhiyun break;
326*4882a593Smuzhiyun cur = old;
327*4882a593Smuzhiyun }
328*4882a593Smuzhiyun return cur >> I_VERSION_QUERIED_SHIFT;
329*4882a593Smuzhiyun }
330*4882a593Smuzhiyun
331*4882a593Smuzhiyun /**
332*4882a593Smuzhiyun * inode_eq_iversion_raw - check whether the raw i_version counter has changed
333*4882a593Smuzhiyun * @inode: inode to check
334*4882a593Smuzhiyun * @old: old value to check against its i_version
335*4882a593Smuzhiyun *
336*4882a593Smuzhiyun * Compare the current raw i_version counter with a previous one. Returns true
337*4882a593Smuzhiyun * if they are the same or false if they are different.
338*4882a593Smuzhiyun */
339*4882a593Smuzhiyun static inline bool
inode_eq_iversion_raw(const struct inode * inode,u64 old)340*4882a593Smuzhiyun inode_eq_iversion_raw(const struct inode *inode, u64 old)
341*4882a593Smuzhiyun {
342*4882a593Smuzhiyun return inode_peek_iversion_raw(inode) == old;
343*4882a593Smuzhiyun }
344*4882a593Smuzhiyun
345*4882a593Smuzhiyun /**
346*4882a593Smuzhiyun * inode_eq_iversion - check whether the i_version counter has changed
347*4882a593Smuzhiyun * @inode: inode to check
348*4882a593Smuzhiyun * @old: old value to check against its i_version
349*4882a593Smuzhiyun *
350*4882a593Smuzhiyun * Compare an i_version counter with a previous one. Returns true if they are
351*4882a593Smuzhiyun * the same, and false if they are different.
352*4882a593Smuzhiyun *
353*4882a593Smuzhiyun * Note that we don't need to set the QUERIED flag in this case, as the value
354*4882a593Smuzhiyun * in the inode is not being recorded for later use.
355*4882a593Smuzhiyun */
356*4882a593Smuzhiyun static inline bool
inode_eq_iversion(const struct inode * inode,u64 old)357*4882a593Smuzhiyun inode_eq_iversion(const struct inode *inode, u64 old)
358*4882a593Smuzhiyun {
359*4882a593Smuzhiyun return inode_peek_iversion(inode) == old;
360*4882a593Smuzhiyun }
361*4882a593Smuzhiyun #endif
362