1*4882a593Smuzhiyun======= 2*4882a593SmuzhiyunLocking 3*4882a593Smuzhiyun======= 4*4882a593Smuzhiyun 5*4882a593SmuzhiyunThe text below describes the locking rules for VFS-related methods. 6*4882a593SmuzhiyunIt is (believed to be) up-to-date. *Please*, if you change anything in 7*4882a593Smuzhiyunprototypes or locking protocols - update this file. And update the relevant 8*4882a593Smuzhiyuninstances in the tree, don't leave that to maintainers of filesystems/devices/ 9*4882a593Smuzhiyunetc. At the very least, put the list of dubious cases in the end of this file. 10*4882a593SmuzhiyunDon't turn it into log - maintainers of out-of-the-tree code are supposed to 11*4882a593Smuzhiyunbe able to use diff(1). 12*4882a593Smuzhiyun 13*4882a593SmuzhiyunThing currently missing here: socket operations. Alexey? 14*4882a593Smuzhiyun 15*4882a593Smuzhiyundentry_operations 16*4882a593Smuzhiyun================= 17*4882a593Smuzhiyun 18*4882a593Smuzhiyunprototypes:: 19*4882a593Smuzhiyun 20*4882a593Smuzhiyun int (*d_revalidate)(struct dentry *, unsigned int); 21*4882a593Smuzhiyun int (*d_weak_revalidate)(struct dentry *, unsigned int); 22*4882a593Smuzhiyun int (*d_hash)(const struct dentry *, struct qstr *); 23*4882a593Smuzhiyun int (*d_compare)(const struct dentry *, 24*4882a593Smuzhiyun unsigned int, const char *, const struct qstr *); 25*4882a593Smuzhiyun int (*d_delete)(struct dentry *); 26*4882a593Smuzhiyun int (*d_init)(struct dentry *); 27*4882a593Smuzhiyun void (*d_release)(struct dentry *); 28*4882a593Smuzhiyun void (*d_iput)(struct dentry *, struct inode *); 29*4882a593Smuzhiyun char *(*d_dname)(struct dentry *dentry, char *buffer, int buflen); 30*4882a593Smuzhiyun struct vfsmount *(*d_automount)(struct path *path); 31*4882a593Smuzhiyun int (*d_manage)(const struct path *, bool); 32*4882a593Smuzhiyun struct dentry *(*d_real)(struct dentry *, const struct inode *); 33*4882a593Smuzhiyun 34*4882a593Smuzhiyunlocking rules: 35*4882a593Smuzhiyun 
36*4882a593Smuzhiyun================== =========== ======== ============== ======== 37*4882a593Smuzhiyunops rename_lock ->d_lock may block rcu-walk 38*4882a593Smuzhiyun================== =========== ======== ============== ======== 39*4882a593Smuzhiyund_revalidate: no no yes (ref-walk) maybe 40*4882a593Smuzhiyund_weak_revalidate: no no yes no 41*4882a593Smuzhiyund_hash no no no maybe 42*4882a593Smuzhiyund_compare: yes no no maybe 43*4882a593Smuzhiyund_delete: no yes no no 44*4882a593Smuzhiyund_init: no no yes no 45*4882a593Smuzhiyund_release: no no yes no 46*4882a593Smuzhiyund_prune: no yes no no 47*4882a593Smuzhiyund_iput: no no yes no 48*4882a593Smuzhiyund_dname: no no no no 49*4882a593Smuzhiyund_automount: no no yes no 50*4882a593Smuzhiyund_manage: no no yes (ref-walk) maybe 51*4882a593Smuzhiyund_real no no yes no 52*4882a593Smuzhiyun================== =========== ======== ============== ======== 53*4882a593Smuzhiyun 54*4882a593Smuzhiyuninode_operations 55*4882a593Smuzhiyun================ 56*4882a593Smuzhiyun 57*4882a593Smuzhiyunprototypes:: 58*4882a593Smuzhiyun 59*4882a593Smuzhiyun int (*create) (struct inode *,struct dentry *,umode_t, bool); 60*4882a593Smuzhiyun struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int); 61*4882a593Smuzhiyun int (*link) (struct dentry *,struct inode *,struct dentry *); 62*4882a593Smuzhiyun int (*unlink) (struct inode *,struct dentry *); 63*4882a593Smuzhiyun int (*symlink) (struct inode *,struct dentry *,const char *); 64*4882a593Smuzhiyun int (*mkdir) (struct inode *,struct dentry *,umode_t); 65*4882a593Smuzhiyun int (*rmdir) (struct inode *,struct dentry *); 66*4882a593Smuzhiyun int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t); 67*4882a593Smuzhiyun int (*rename) (struct inode *, struct dentry *, 68*4882a593Smuzhiyun struct inode *, struct dentry *, unsigned int); 69*4882a593Smuzhiyun int (*readlink) (struct dentry *, char __user *,int); 70*4882a593Smuzhiyun const char *(*get_link) (struct dentry *, 
struct inode *, struct delayed_call *); 71*4882a593Smuzhiyun void (*truncate) (struct inode *); 72*4882a593Smuzhiyun int (*permission) (struct inode *, int, unsigned int); 73*4882a593Smuzhiyun int (*get_acl)(struct inode *, int); 74*4882a593Smuzhiyun int (*setattr) (struct dentry *, struct iattr *); 75*4882a593Smuzhiyun int (*getattr) (const struct path *, struct kstat *, u32, unsigned int); 76*4882a593Smuzhiyun ssize_t (*listxattr) (struct dentry *, char *, size_t); 77*4882a593Smuzhiyun int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len); 78*4882a593Smuzhiyun void (*update_time)(struct inode *, struct timespec *, int); 79*4882a593Smuzhiyun int (*atomic_open)(struct inode *, struct dentry *, 80*4882a593Smuzhiyun struct file *, unsigned open_flag, 81*4882a593Smuzhiyun umode_t create_mode); 82*4882a593Smuzhiyun int (*tmpfile) (struct inode *, struct dentry *, umode_t); 83*4882a593Smuzhiyun 84*4882a593Smuzhiyunlocking rules: 85*4882a593Smuzhiyun all may block 86*4882a593Smuzhiyun 87*4882a593Smuzhiyun============ ============================================= 88*4882a593Smuzhiyunops i_rwsem(inode) 89*4882a593Smuzhiyun============ ============================================= 90*4882a593Smuzhiyunlookup: shared 91*4882a593Smuzhiyuncreate: exclusive 92*4882a593Smuzhiyunlink: exclusive (both) 93*4882a593Smuzhiyunmknod: exclusive 94*4882a593Smuzhiyunsymlink: exclusive 95*4882a593Smuzhiyunmkdir: exclusive 96*4882a593Smuzhiyununlink: exclusive (both) 97*4882a593Smuzhiyunrmdir: exclusive (both)(see below) 98*4882a593Smuzhiyunrename: exclusive (all) (see below) 99*4882a593Smuzhiyunreadlink: no 100*4882a593Smuzhiyunget_link: no 101*4882a593Smuzhiyunsetattr: exclusive 102*4882a593Smuzhiyunpermission: no (may not block if called in rcu-walk mode) 103*4882a593Smuzhiyunget_acl: no 104*4882a593Smuzhiyungetattr: no 105*4882a593Smuzhiyunlistxattr: no 106*4882a593Smuzhiyunfiemap: no 107*4882a593Smuzhiyunupdate_time: no 108*4882a593Smuzhiyunatomic_open: shared 
(exclusive if O_CREAT is set in open flags) 109*4882a593Smuzhiyuntmpfile: no 110*4882a593Smuzhiyun============ ============================================= 111*4882a593Smuzhiyun 112*4882a593Smuzhiyun 113*4882a593Smuzhiyun Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_rwsem 114*4882a593Smuzhiyun exclusive on victim. 115*4882a593Smuzhiyun cross-directory ->rename() has (per-superblock) ->s_vfs_rename_sem. 116*4882a593Smuzhiyun 117*4882a593SmuzhiyunSee Documentation/filesystems/directory-locking.rst for more detailed discussion 118*4882a593Smuzhiyunof the locking scheme for directory operations. 119*4882a593Smuzhiyun 120*4882a593Smuzhiyunxattr_handler operations 121*4882a593Smuzhiyun======================== 122*4882a593Smuzhiyun 123*4882a593Smuzhiyunprototypes:: 124*4882a593Smuzhiyun 125*4882a593Smuzhiyun bool (*list)(struct dentry *dentry); 126*4882a593Smuzhiyun int (*get)(const struct xattr_handler *handler, struct dentry *dentry, 127*4882a593Smuzhiyun struct inode *inode, const char *name, void *buffer, 128*4882a593Smuzhiyun size_t size, int flags); 129*4882a593Smuzhiyun int (*set)(const struct xattr_handler *handler, struct dentry *dentry, 130*4882a593Smuzhiyun struct inode *inode, const char *name, const void *buffer, 131*4882a593Smuzhiyun size_t size, int flags); 132*4882a593Smuzhiyun 133*4882a593Smuzhiyunlocking rules: 134*4882a593Smuzhiyun all may block 135*4882a593Smuzhiyun 136*4882a593Smuzhiyun===== ============== 137*4882a593Smuzhiyunops i_rwsem(inode) 138*4882a593Smuzhiyun===== ============== 139*4882a593Smuzhiyunlist: no 140*4882a593Smuzhiyunget: no 141*4882a593Smuzhiyunset: exclusive 142*4882a593Smuzhiyun===== ============== 143*4882a593Smuzhiyun 144*4882a593Smuzhiyunsuper_operations 145*4882a593Smuzhiyun================ 146*4882a593Smuzhiyun 147*4882a593Smuzhiyunprototypes:: 148*4882a593Smuzhiyun 149*4882a593Smuzhiyun struct inode *(*alloc_inode)(struct super_block *sb); 150*4882a593Smuzhiyun void (*free_inode)(struct inode *); 
151*4882a593Smuzhiyun void (*destroy_inode)(struct inode *); 152*4882a593Smuzhiyun void (*dirty_inode) (struct inode *, int flags); 153*4882a593Smuzhiyun int (*write_inode) (struct inode *, struct writeback_control *wbc); 154*4882a593Smuzhiyun int (*drop_inode) (struct inode *); 155*4882a593Smuzhiyun void (*evict_inode) (struct inode *); 156*4882a593Smuzhiyun void (*put_super) (struct super_block *); 157*4882a593Smuzhiyun int (*sync_fs)(struct super_block *sb, int wait); 158*4882a593Smuzhiyun int (*freeze_fs) (struct super_block *); 159*4882a593Smuzhiyun int (*unfreeze_fs) (struct super_block *); 160*4882a593Smuzhiyun int (*statfs) (struct dentry *, struct kstatfs *); 161*4882a593Smuzhiyun int (*remount_fs) (struct super_block *, int *, char *); 162*4882a593Smuzhiyun void (*umount_begin) (struct super_block *); 163*4882a593Smuzhiyun int (*show_options)(struct seq_file *, struct dentry *); 164*4882a593Smuzhiyun ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); 165*4882a593Smuzhiyun ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); 166*4882a593Smuzhiyun int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t); 167*4882a593Smuzhiyun 168*4882a593Smuzhiyunlocking rules: 169*4882a593Smuzhiyun All may block [not true, see below] 170*4882a593Smuzhiyun 171*4882a593Smuzhiyun====================== ============ ======================== 172*4882a593Smuzhiyunops s_umount note 173*4882a593Smuzhiyun====================== ============ ======================== 174*4882a593Smuzhiyunalloc_inode: 175*4882a593Smuzhiyunfree_inode: called from RCU callback 176*4882a593Smuzhiyundestroy_inode: 177*4882a593Smuzhiyundirty_inode: 178*4882a593Smuzhiyunwrite_inode: 179*4882a593Smuzhiyundrop_inode: !!!inode->i_lock!!! 
180*4882a593Smuzhiyunevict_inode: 181*4882a593Smuzhiyunput_super: write 182*4882a593Smuzhiyunsync_fs: read 183*4882a593Smuzhiyunfreeze_fs: write 184*4882a593Smuzhiyununfreeze_fs: write 185*4882a593Smuzhiyunstatfs: maybe(read) (see below) 186*4882a593Smuzhiyunremount_fs: write 187*4882a593Smuzhiyunumount_begin: no 188*4882a593Smuzhiyunshow_options: no (namespace_sem) 189*4882a593Smuzhiyunquota_read: no (see below) 190*4882a593Smuzhiyunquota_write: no (see below) 191*4882a593Smuzhiyunbdev_try_to_free_page: no (see below) 192*4882a593Smuzhiyun====================== ============ ======================== 193*4882a593Smuzhiyun 194*4882a593Smuzhiyun->statfs() has s_umount (shared) when called by ustat(2) (native or 195*4882a593Smuzhiyuncompat), but that's an accident of bad API; s_umount is used to pin 196*4882a593Smuzhiyunthe superblock down when we only have dev_t given us by userland to 197*4882a593Smuzhiyunidentify the superblock. Everything else (statfs(), fstatfs(), etc.) 198*4882a593Smuzhiyundoesn't hold it when calling ->statfs() - superblock is pinned down 199*4882a593Smuzhiyunby resolving the pathname passed to syscall. 200*4882a593Smuzhiyun 201*4882a593Smuzhiyun->quota_read() and ->quota_write() functions are both guaranteed to 202*4882a593Smuzhiyunbe the only ones operating on the quota file by the quota code (via 203*4882a593Smuzhiyundqio_sem) (unless an admin really wants to screw up something and 204*4882a593Smuzhiyunwrites to quota files with quotas on). For other details about locking 205*4882a593Smuzhiyunsee also dquot_operations section. 206*4882a593Smuzhiyun 207*4882a593Smuzhiyun->bdev_try_to_free_page is called from the ->releasepage handler of 208*4882a593Smuzhiyunthe block device inode. See there for more details. 
209*4882a593Smuzhiyun 210*4882a593Smuzhiyunfile_system_type 211*4882a593Smuzhiyun================ 212*4882a593Smuzhiyun 213*4882a593Smuzhiyunprototypes:: 214*4882a593Smuzhiyun 215*4882a593Smuzhiyun struct dentry *(*mount) (struct file_system_type *, int, 216*4882a593Smuzhiyun const char *, void *); 217*4882a593Smuzhiyun void (*kill_sb) (struct super_block *); 218*4882a593Smuzhiyun 219*4882a593Smuzhiyunlocking rules: 220*4882a593Smuzhiyun 221*4882a593Smuzhiyun======= ========= 222*4882a593Smuzhiyunops may block 223*4882a593Smuzhiyun======= ========= 224*4882a593Smuzhiyunmount yes 225*4882a593Smuzhiyunkill_sb yes 226*4882a593Smuzhiyun======= ========= 227*4882a593Smuzhiyun 228*4882a593Smuzhiyun->mount() returns ERR_PTR or the root dentry; its superblock should be locked 229*4882a593Smuzhiyunon return. 230*4882a593Smuzhiyun 231*4882a593Smuzhiyun->kill_sb() takes a write-locked superblock, does all shutdown work on it, 232*4882a593Smuzhiyununlocks and drops the reference. 233*4882a593Smuzhiyun 234*4882a593Smuzhiyunaddress_space_operations 235*4882a593Smuzhiyun======================== 236*4882a593Smuzhiyunprototypes:: 237*4882a593Smuzhiyun 238*4882a593Smuzhiyun int (*writepage)(struct page *page, struct writeback_control *wbc); 239*4882a593Smuzhiyun int (*readpage)(struct file *, struct page *); 240*4882a593Smuzhiyun int (*writepages)(struct address_space *, struct writeback_control *); 241*4882a593Smuzhiyun int (*set_page_dirty)(struct page *page); 242*4882a593Smuzhiyun void (*readahead)(struct readahead_control *); 243*4882a593Smuzhiyun int (*readpages)(struct file *filp, struct address_space *mapping, 244*4882a593Smuzhiyun struct list_head *pages, unsigned nr_pages); 245*4882a593Smuzhiyun int (*write_begin)(struct file *, struct address_space *mapping, 246*4882a593Smuzhiyun loff_t pos, unsigned len, unsigned flags, 247*4882a593Smuzhiyun struct page **pagep, void **fsdata); 248*4882a593Smuzhiyun int (*write_end)(struct file *, struct address_space *mapping, 
249*4882a593Smuzhiyun loff_t pos, unsigned len, unsigned copied, 250*4882a593Smuzhiyun struct page *page, void *fsdata); 251*4882a593Smuzhiyun sector_t (*bmap)(struct address_space *, sector_t); 252*4882a593Smuzhiyun void (*invalidatepage) (struct page *, unsigned int, unsigned int); 253*4882a593Smuzhiyun int (*releasepage) (struct page *, int); 254*4882a593Smuzhiyun void (*freepage)(struct page *); 255*4882a593Smuzhiyun int (*direct_IO)(struct kiocb *, struct iov_iter *iter); 256*4882a593Smuzhiyun bool (*isolate_page) (struct page *, isolate_mode_t); 257*4882a593Smuzhiyun int (*migratepage)(struct address_space *, struct page *, struct page *); 258*4882a593Smuzhiyun void (*putback_page) (struct page *); 259*4882a593Smuzhiyun int (*launder_page)(struct page *); 260*4882a593Smuzhiyun int (*is_partially_uptodate)(struct page *, unsigned long, unsigned long); 261*4882a593Smuzhiyun int (*error_remove_page)(struct address_space *, struct page *); 262*4882a593Smuzhiyun int (*swap_activate)(struct file *); 263*4882a593Smuzhiyun int (*swap_deactivate)(struct file *); 264*4882a593Smuzhiyun 265*4882a593Smuzhiyunlocking rules: 266*4882a593Smuzhiyun All except set_page_dirty and freepage may block 267*4882a593Smuzhiyun 268*4882a593Smuzhiyun====================== ======================== ========= 269*4882a593Smuzhiyunops PageLocked(page) i_rwsem 270*4882a593Smuzhiyun====================== ======================== ========= 271*4882a593Smuzhiyunwritepage: yes, unlocks (see below) 272*4882a593Smuzhiyunreadpage: yes, unlocks 273*4882a593Smuzhiyunwritepages: 274*4882a593Smuzhiyunset_page_dirty no 275*4882a593Smuzhiyunreadahead: yes, unlocks 276*4882a593Smuzhiyunreadpages: no 277*4882a593Smuzhiyunwrite_begin: locks the page exclusive 278*4882a593Smuzhiyunwrite_end: yes, unlocks exclusive 279*4882a593Smuzhiyunbmap: 280*4882a593Smuzhiyuninvalidatepage: yes 281*4882a593Smuzhiyunreleasepage: yes 282*4882a593Smuzhiyunfreepage: yes 283*4882a593Smuzhiyundirect_IO: 
284*4882a593Smuzhiyunisolate_page: yes 285*4882a593Smuzhiyunmigratepage: yes (both) 286*4882a593Smuzhiyunputback_page: yes 287*4882a593Smuzhiyunlaunder_page: yes 288*4882a593Smuzhiyunis_partially_uptodate: yes 289*4882a593Smuzhiyunerror_remove_page: yes 290*4882a593Smuzhiyunswap_activate: no 291*4882a593Smuzhiyunswap_deactivate: no 292*4882a593Smuzhiyun====================== ======================== ========= 293*4882a593Smuzhiyun 294*4882a593Smuzhiyun->write_begin(), ->write_end() and ->readpage() may be called from 295*4882a593Smuzhiyunthe request handler (/dev/loop). 296*4882a593Smuzhiyun 297*4882a593Smuzhiyun->readpage() unlocks the page, either synchronously or via I/O 298*4882a593Smuzhiyuncompletion. 299*4882a593Smuzhiyun 300*4882a593Smuzhiyun->readahead() unlocks the pages that I/O is attempted on like ->readpage(). 301*4882a593Smuzhiyun 302*4882a593Smuzhiyun->readpages() populates the pagecache with the passed pages and starts 303*4882a593SmuzhiyunI/O against them. They come unlocked upon I/O completion. 304*4882a593Smuzhiyun 305*4882a593Smuzhiyun->writepage() is used for two purposes: for "memory cleansing" and for 306*4882a593Smuzhiyun"sync". These are quite different operations and the behaviour may differ 307*4882a593Smuzhiyundepending upon the mode. 308*4882a593Smuzhiyun 309*4882a593SmuzhiyunIf writepage is called for sync (wbc->sync_mode != WBC_SYNC_NONE) then 310*4882a593Smuzhiyunit *must* start I/O against the page, even if that would involve 311*4882a593Smuzhiyunblocking on in-progress I/O. 312*4882a593Smuzhiyun 313*4882a593SmuzhiyunIf writepage is called for memory cleansing (sync_mode == 314*4882a593SmuzhiyunWBC_SYNC_NONE) then its role is to get as much writeout underway as 315*4882a593Smuzhiyunpossible. So writepage should try to avoid blocking against 316*4882a593Smuzhiyuncurrently-in-progress I/O. 
317*4882a593Smuzhiyun 318*4882a593SmuzhiyunIf the filesystem is not called for "sync" and it determines that it 319*4882a593Smuzhiyunwould need to block against in-progress I/O to be able to start new I/O 320*4882a593Smuzhiyunagainst the page the filesystem should redirty the page with 321*4882a593Smuzhiyunredirty_page_for_writepage(), then unlock the page and return zero. 322*4882a593SmuzhiyunThis may also be done to avoid internal deadlocks, but rarely. 323*4882a593Smuzhiyun 324*4882a593SmuzhiyunIf the filesystem is called for sync then it must wait on any 325*4882a593Smuzhiyunin-progress I/O and then start new I/O. 326*4882a593Smuzhiyun 327*4882a593SmuzhiyunThe filesystem should unlock the page synchronously, before returning to the 328*4882a593Smuzhiyuncaller, unless ->writepage() returns special WRITEPAGE_ACTIVATE 329*4882a593Smuzhiyunvalue. WRITEPAGE_ACTIVATE means that page cannot really be written out 330*4882a593Smuzhiyuncurrently, and VM should stop calling ->writepage() on this page for some 331*4882a593Smuzhiyuntime. VM does this by moving page to the head of the active list, hence the 332*4882a593Smuzhiyunname. 333*4882a593Smuzhiyun 334*4882a593SmuzhiyunUnless the filesystem is going to redirty_page_for_writepage(), unlock the page 335*4882a593Smuzhiyunand return zero, writepage *must* run set_page_writeback() against the page, 336*4882a593Smuzhiyunfollowed by unlocking it. Once set_page_writeback() has been run against the 337*4882a593Smuzhiyunpage, write I/O can be submitted and the write I/O completion handler must run 338*4882a593Smuzhiyunend_page_writeback() once the I/O is complete. If no I/O is submitted, the 339*4882a593Smuzhiyunfilesystem must run end_page_writeback() against the page before returning from 340*4882a593Smuzhiyunwritepage. 341*4882a593Smuzhiyun 342*4882a593SmuzhiyunThat is: after 2.5.12, pages which are under writeout are *not* locked. 
Note, 343*4882a593Smuzhiyunif the filesystem needs the page to be locked during writeout, that is ok, too, 344*4882a593Smuzhiyunthe page is allowed to be unlocked at any point in time between the calls to 345*4882a593Smuzhiyunset_page_writeback() and end_page_writeback(). 346*4882a593Smuzhiyun 347*4882a593SmuzhiyunNote, failure to run either redirty_page_for_writepage() or the combination of 348*4882a593Smuzhiyunset_page_writeback()/end_page_writeback() on a page submitted to writepage 349*4882a593Smuzhiyunwill leave the page itself marked clean but it will be tagged as dirty in the 350*4882a593Smuzhiyunradix tree. This incoherency can lead to all sorts of hard-to-debug problems 351*4882a593Smuzhiyunin the filesystem like having dirty inodes at umount and losing written data. 352*4882a593Smuzhiyun 353*4882a593Smuzhiyun->writepages() is used for periodic writeback and for syscall-initiated 354*4882a593Smuzhiyunsync operations. The address_space should start I/O against at least 355*4882a593Smuzhiyun``*nr_to_write`` pages. ``*nr_to_write`` must be decremented for each page 356*4882a593Smuzhiyunwhich is written. The address_space implementation may write more (or less) 357*4882a593Smuzhiyunpages than ``*nr_to_write`` asks for, but it should try to be reasonably close. 358*4882a593SmuzhiyunIf nr_to_write is NULL, all dirty pages must be written. 359*4882a593Smuzhiyun 360*4882a593Smuzhiyunwritepages should _only_ write pages which are present on 361*4882a593Smuzhiyunmapping->io_pages. 362*4882a593Smuzhiyun 363*4882a593Smuzhiyun->set_page_dirty() is called from various places in the kernel 364*4882a593Smuzhiyunwhen the target page is marked as needing writeback. It may be called 365*4882a593Smuzhiyununder spinlock (it cannot block) and is sometimes called with the page 366*4882a593Smuzhiyunnot locked. 367*4882a593Smuzhiyun 368*4882a593Smuzhiyun->bmap() is currently used by legacy ioctl() (FIBMAP) provided by some 369*4882a593Smuzhiyunfilesystems and by the swapper. 
The latter will eventually go away. Please, 370*4882a593Smuzhiyunkeep it that way and don't breed new callers. 371*4882a593Smuzhiyun 372*4882a593Smuzhiyun->invalidatepage() is called when the filesystem must attempt to drop 373*4882a593Smuzhiyunsome or all of the buffers from the page when it is being truncated. It 374*4882a593Smuzhiyunreturns zero on success. If ->invalidatepage is zero, the kernel uses 375*4882a593Smuzhiyunblock_invalidatepage() instead. 376*4882a593Smuzhiyun 377*4882a593Smuzhiyun->releasepage() is called when the kernel is about to try to drop the 378*4882a593Smuzhiyunbuffers from the page in preparation for freeing it. It returns zero to 379*4882a593Smuzhiyunindicate that the buffers are (or may be) freeable. If ->releasepage is zero, 380*4882a593Smuzhiyunthe kernel assumes that the fs has no private interest in the buffers. 381*4882a593Smuzhiyun 382*4882a593Smuzhiyun->freepage() is called when the kernel is done dropping the page 383*4882a593Smuzhiyunfrom the page cache. 384*4882a593Smuzhiyun 385*4882a593Smuzhiyun->launder_page() may be called prior to releasing a page if 386*4882a593Smuzhiyunit is still found to be dirty. It returns zero if the page was successfully 387*4882a593Smuzhiyuncleaned, or an error value if not. Note that in order to prevent the page 388*4882a593Smuzhiyungetting mapped back in and redirtied, it needs to be kept locked 389*4882a593Smuzhiyunacross the entire operation. 390*4882a593Smuzhiyun 391*4882a593Smuzhiyun->swap_activate will be called with a non-zero argument on 392*4882a593Smuzhiyunfiles backing (non block device backed) swapfiles. A return value 393*4882a593Smuzhiyunof zero indicates success, in which case this file can be used for 394*4882a593Smuzhiyunbacking swapspace. The swapspace operations will be proxied to the 395*4882a593Smuzhiyunaddress space operations. 
396*4882a593Smuzhiyun 397*4882a593Smuzhiyun->swap_deactivate() will be called in the sys_swapoff() 398*4882a593Smuzhiyunpath after ->swap_activate() returned success. 399*4882a593Smuzhiyun 400*4882a593Smuzhiyunfile_lock_operations 401*4882a593Smuzhiyun==================== 402*4882a593Smuzhiyun 403*4882a593Smuzhiyunprototypes:: 404*4882a593Smuzhiyun 405*4882a593Smuzhiyun void (*fl_copy_lock)(struct file_lock *, struct file_lock *); 406*4882a593Smuzhiyun void (*fl_release_private)(struct file_lock *); 407*4882a593Smuzhiyun 408*4882a593Smuzhiyun 409*4882a593Smuzhiyunlocking rules: 410*4882a593Smuzhiyun 411*4882a593Smuzhiyun=================== ============= ========= 412*4882a593Smuzhiyunops inode->i_lock may block 413*4882a593Smuzhiyun=================== ============= ========= 414*4882a593Smuzhiyunfl_copy_lock: yes no 415*4882a593Smuzhiyunfl_release_private: maybe maybe[1]_ 416*4882a593Smuzhiyun=================== ============= ========= 417*4882a593Smuzhiyun 418*4882a593Smuzhiyun.. [1]: 419*4882a593Smuzhiyun ->fl_release_private for flock or POSIX locks is currently allowed 420*4882a593Smuzhiyun to block. Leases however can still be freed while the i_lock is held and 421*4882a593Smuzhiyun so fl_release_private called on a lease should not block. 
422*4882a593Smuzhiyun 423*4882a593Smuzhiyunlock_manager_operations 424*4882a593Smuzhiyun======================= 425*4882a593Smuzhiyun 426*4882a593Smuzhiyunprototypes:: 427*4882a593Smuzhiyun 428*4882a593Smuzhiyun void (*lm_notify)(struct file_lock *); /* unblock callback */ 429*4882a593Smuzhiyun int (*lm_grant)(struct file_lock *, struct file_lock *, int); 430*4882a593Smuzhiyun void (*lm_break)(struct file_lock *); /* break_lease callback */ 431*4882a593Smuzhiyun int (*lm_change)(struct file_lock **, int); 432*4882a593Smuzhiyun bool (*lm_breaker_owns_lease)(struct file_lock *); 433*4882a593Smuzhiyun 434*4882a593Smuzhiyunlocking rules: 435*4882a593Smuzhiyun 436*4882a593Smuzhiyun====================== ============= ================= ========= 437*4882a593Smuzhiyunops inode->i_lock blocked_lock_lock may block 438*4882a593Smuzhiyun====================== ============= ================= ========= 439*4882a593Smuzhiyunlm_notify: yes yes no 440*4882a593Smuzhiyunlm_grant: no no no 441*4882a593Smuzhiyunlm_break: yes no no 442*4882a593Smuzhiyunlm_change yes no no 443*4882a593Smuzhiyunlm_breaker_owns_lease: no no no 444*4882a593Smuzhiyun====================== ============= ================= ========= 445*4882a593Smuzhiyun 446*4882a593Smuzhiyunbuffer_head 447*4882a593Smuzhiyun=========== 448*4882a593Smuzhiyun 449*4882a593Smuzhiyunprototypes:: 450*4882a593Smuzhiyun 451*4882a593Smuzhiyun void (*b_end_io)(struct buffer_head *bh, int uptodate); 452*4882a593Smuzhiyun 453*4882a593Smuzhiyunlocking rules: 454*4882a593Smuzhiyun 455*4882a593Smuzhiyuncalled from interrupts. In other words, extreme care is needed here. 456*4882a593Smuzhiyunbh is locked, but that's all warranties we have here. Currently only RAID1, 457*4882a593Smuzhiyunhighmem, fs/buffer.c, and fs/ntfs/aops.c are providing these. Block devices 458*4882a593Smuzhiyuncall this method upon the IO completion. 
459*4882a593Smuzhiyun 460*4882a593Smuzhiyunblock_device_operations 461*4882a593Smuzhiyun======================= 462*4882a593Smuzhiyunprototypes:: 463*4882a593Smuzhiyun 464*4882a593Smuzhiyun int (*open) (struct block_device *, fmode_t); 465*4882a593Smuzhiyun int (*release) (struct gendisk *, fmode_t); 466*4882a593Smuzhiyun int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); 467*4882a593Smuzhiyun int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); 468*4882a593Smuzhiyun int (*direct_access) (struct block_device *, sector_t, void **, 469*4882a593Smuzhiyun unsigned long *); 470*4882a593Smuzhiyun void (*unlock_native_capacity) (struct gendisk *); 471*4882a593Smuzhiyun int (*revalidate_disk) (struct gendisk *); 472*4882a593Smuzhiyun int (*getgeo)(struct block_device *, struct hd_geometry *); 473*4882a593Smuzhiyun void (*swap_slot_free_notify) (struct block_device *, unsigned long); 474*4882a593Smuzhiyun 475*4882a593Smuzhiyunlocking rules: 476*4882a593Smuzhiyun 477*4882a593Smuzhiyun======================= =================== 478*4882a593Smuzhiyunops bd_mutex 479*4882a593Smuzhiyun======================= =================== 480*4882a593Smuzhiyunopen: yes 481*4882a593Smuzhiyunrelease: yes 482*4882a593Smuzhiyunioctl: no 483*4882a593Smuzhiyuncompat_ioctl: no 484*4882a593Smuzhiyundirect_access: no 485*4882a593Smuzhiyununlock_native_capacity: no 486*4882a593Smuzhiyunrevalidate_disk: no 487*4882a593Smuzhiyungetgeo: no 488*4882a593Smuzhiyunswap_slot_free_notify: no (see below) 489*4882a593Smuzhiyun======================= =================== 490*4882a593Smuzhiyun 491*4882a593Smuzhiyunswap_slot_free_notify is called with swap_lock and sometimes the page lock 492*4882a593Smuzhiyunheld. 
493*4882a593Smuzhiyun 494*4882a593Smuzhiyun 495*4882a593Smuzhiyunfile_operations 496*4882a593Smuzhiyun=============== 497*4882a593Smuzhiyun 498*4882a593Smuzhiyunprototypes:: 499*4882a593Smuzhiyun 500*4882a593Smuzhiyun loff_t (*llseek) (struct file *, loff_t, int); 501*4882a593Smuzhiyun ssize_t (*read) (struct file *, char __user *, size_t, loff_t *); 502*4882a593Smuzhiyun ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); 503*4882a593Smuzhiyun ssize_t (*read_iter) (struct kiocb *, struct iov_iter *); 504*4882a593Smuzhiyun ssize_t (*write_iter) (struct kiocb *, struct iov_iter *); 505*4882a593Smuzhiyun int (*iterate) (struct file *, struct dir_context *); 506*4882a593Smuzhiyun int (*iterate_shared) (struct file *, struct dir_context *); 507*4882a593Smuzhiyun __poll_t (*poll) (struct file *, struct poll_table_struct *); 508*4882a593Smuzhiyun long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); 509*4882a593Smuzhiyun long (*compat_ioctl) (struct file *, unsigned int, unsigned long); 510*4882a593Smuzhiyun int (*mmap) (struct file *, struct vm_area_struct *); 511*4882a593Smuzhiyun int (*open) (struct inode *, struct file *); 512*4882a593Smuzhiyun int (*flush) (struct file *); 513*4882a593Smuzhiyun int (*release) (struct inode *, struct file *); 514*4882a593Smuzhiyun int (*fsync) (struct file *, loff_t start, loff_t end, int datasync); 515*4882a593Smuzhiyun int (*fasync) (int, struct file *, int); 516*4882a593Smuzhiyun int (*lock) (struct file *, int, struct file_lock *); 517*4882a593Smuzhiyun ssize_t (*readv) (struct file *, const struct iovec *, unsigned long, 518*4882a593Smuzhiyun loff_t *); 519*4882a593Smuzhiyun ssize_t (*writev) (struct file *, const struct iovec *, unsigned long, 520*4882a593Smuzhiyun loff_t *); 521*4882a593Smuzhiyun ssize_t (*sendfile) (struct file *, loff_t *, size_t, read_actor_t, 522*4882a593Smuzhiyun void __user *); 523*4882a593Smuzhiyun ssize_t (*sendpage) (struct file *, struct page *, int, size_t, 
524*4882a593Smuzhiyun loff_t *, int); 525*4882a593Smuzhiyun unsigned long (*get_unmapped_area)(struct file *, unsigned long, 526*4882a593Smuzhiyun unsigned long, unsigned long, unsigned long); 527*4882a593Smuzhiyun int (*check_flags)(int); 528*4882a593Smuzhiyun int (*flock) (struct file *, int, struct file_lock *); 529*4882a593Smuzhiyun ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, 530*4882a593Smuzhiyun size_t, unsigned int); 531*4882a593Smuzhiyun ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, 532*4882a593Smuzhiyun size_t, unsigned int); 533*4882a593Smuzhiyun int (*setlease)(struct file *, long, struct file_lock **, void **); 534*4882a593Smuzhiyun long (*fallocate)(struct file *, int, loff_t, loff_t); 535*4882a593Smuzhiyun 536*4882a593Smuzhiyunlocking rules: 537*4882a593Smuzhiyun All may block. 538*4882a593Smuzhiyun 539*4882a593Smuzhiyun->llseek() locking has moved from llseek to the individual llseek 540*4882a593Smuzhiyunimplementations. If your fs is not using generic_file_llseek, you 541*4882a593Smuzhiyunneed to acquire and release the appropriate locks in your ->llseek(). 542*4882a593SmuzhiyunFor many filesystems, it is probably safe to acquire the inode 543*4882a593Smuzhiyunmutex or just to use i_size_read() instead. 544*4882a593SmuzhiyunNote: this does not protect the file->f_pos against concurrent modifications 545*4882a593Smuzhiyunsince this is something the userspace has to take care about. 546*4882a593Smuzhiyun 547*4882a593Smuzhiyun->iterate() is called with i_rwsem exclusive. 548*4882a593Smuzhiyun 549*4882a593Smuzhiyun->iterate_shared() is called with i_rwsem at least shared. 550*4882a593Smuzhiyun 551*4882a593Smuzhiyun->fasync() is responsible for maintaining the FASYNC bit in filp->f_flags. 552*4882a593SmuzhiyunMost instances call fasync_helper(), which does that maintenance, so it's 553*4882a593Smuzhiyunnot normally something one needs to worry about. 
Return values > 0 will be 554*4882a593Smuzhiyunmapped to zero in the VFS layer. 555*4882a593Smuzhiyun 556*4882a593Smuzhiyun->readdir() and ->ioctl() on directories must be changed. Ideally we would 557*4882a593Smuzhiyunmove ->readdir() to inode_operations and use a separate method for directory 558*4882a593Smuzhiyun->ioctl() or kill the latter completely. One of the problems is that for 559*4882a593Smuzhiyunanything that resembles union-mount we won't have a struct file for all 560*4882a593Smuzhiyuncomponents. And there are other reasons why the current interface is a mess... 561*4882a593Smuzhiyun 562*4882a593Smuzhiyun->read on directories probably must go away - we should just enforce -EISDIR 563*4882a593Smuzhiyunin sys_read() and friends. 564*4882a593Smuzhiyun 565*4882a593Smuzhiyun->setlease operations should call generic_setlease() before or after setting 566*4882a593Smuzhiyunthe lease within the individual filesystem to record the result of the 567*4882a593Smuzhiyunoperation 568*4882a593Smuzhiyun 569*4882a593Smuzhiyundquot_operations 570*4882a593Smuzhiyun================ 571*4882a593Smuzhiyun 572*4882a593Smuzhiyunprototypes:: 573*4882a593Smuzhiyun 574*4882a593Smuzhiyun int (*write_dquot) (struct dquot *); 575*4882a593Smuzhiyun int (*acquire_dquot) (struct dquot *); 576*4882a593Smuzhiyun int (*release_dquot) (struct dquot *); 577*4882a593Smuzhiyun int (*mark_dirty) (struct dquot *); 578*4882a593Smuzhiyun int (*write_info) (struct super_block *, int); 579*4882a593Smuzhiyun 580*4882a593SmuzhiyunThese operations are intended to be more or less wrapping functions that ensure 581*4882a593Smuzhiyuna proper locking wrt the filesystem and call the generic quota operations. 
582*4882a593Smuzhiyun 583*4882a593SmuzhiyunWhat filesystems should expect from the generic quota functions: 584*4882a593Smuzhiyun 585*4882a593Smuzhiyun============== ============ ========================= 586*4882a593Smuzhiyunops FS recursion Held locks when called 587*4882a593Smuzhiyun============== ============ ========================= 588*4882a593Smuzhiyunwrite_dquot: yes dqonoff_sem or dqptr_sem 589*4882a593Smuzhiyunacquire_dquot: yes dqonoff_sem or dqptr_sem 590*4882a593Smuzhiyunrelease_dquot: yes dqonoff_sem or dqptr_sem 591*4882a593Smuzhiyunmark_dirty: no - 592*4882a593Smuzhiyunwrite_info: yes dqonoff_sem 593*4882a593Smuzhiyun============== ============ ========================= 594*4882a593Smuzhiyun 595*4882a593SmuzhiyunFS recursion means calling ->quota_read() and ->quota_write() from superblock 596*4882a593Smuzhiyunoperations. 597*4882a593Smuzhiyun 598*4882a593SmuzhiyunMore details about quota locking can be found in fs/dquot.c. 599*4882a593Smuzhiyun 600*4882a593Smuzhiyunvm_operations_struct 601*4882a593Smuzhiyun==================== 602*4882a593Smuzhiyun 603*4882a593Smuzhiyunprototypes:: 604*4882a593Smuzhiyun 605*4882a593Smuzhiyun void (*open)(struct vm_area_struct*); 606*4882a593Smuzhiyun void (*close)(struct vm_area_struct*); 607*4882a593Smuzhiyun vm_fault_t (*fault)(struct vm_area_struct*, struct vm_fault *); 608*4882a593Smuzhiyun vm_fault_t (*page_mkwrite)(struct vm_area_struct *, struct vm_fault *); 609*4882a593Smuzhiyun vm_fault_t (*pfn_mkwrite)(struct vm_area_struct *, struct vm_fault *); 610*4882a593Smuzhiyun int (*access)(struct vm_area_struct *, unsigned long, void*, int, int); 611*4882a593Smuzhiyun 612*4882a593Smuzhiyunlocking rules: 613*4882a593Smuzhiyun 614*4882a593Smuzhiyun============= ========= =========================== 615*4882a593Smuzhiyunops mmap_lock PageLocked(page) 616*4882a593Smuzhiyun============= ========= =========================== 617*4882a593Smuzhiyunopen: yes 618*4882a593Smuzhiyunclose: yes 619*4882a593Smuzhiyunfault: yes 
can return with page locked 620*4882a593Smuzhiyunmap_pages: yes 621*4882a593Smuzhiyunpage_mkwrite: yes can return with page locked 622*4882a593Smuzhiyunpfn_mkwrite: yes 623*4882a593Smuzhiyunaccess: yes 624*4882a593Smuzhiyun============= ========= =========================== 625*4882a593Smuzhiyun 626*4882a593Smuzhiyun->fault() is called when a previously not present pte is about 627*4882a593Smuzhiyunto be faulted in. The filesystem must find and return the page associated 628*4882a593Smuzhiyunwith the passed in "pgoff" in the vm_fault structure. If it is possible that 629*4882a593Smuzhiyunthe page may be truncated and/or invalidated, then the filesystem must lock 630*4882a593Smuzhiyunthe page, then ensure it is not already truncated (the page lock will block 631*4882a593Smuzhiyunsubsequent truncate), and then return with VM_FAULT_LOCKED, and the page 632*4882a593Smuzhiyunlocked. The VM will unlock the page. 633*4882a593Smuzhiyun 634*4882a593Smuzhiyun->map_pages() is called when VM asks to map easily accessible pages. 635*4882a593SmuzhiyunFilesystem should find and map pages associated with offsets from "start_pgoff" 636*4882a593Smuzhiyuntill "end_pgoff". ->map_pages() is called with page table locked and must 637*4882a593Smuzhiyunnot block. If it's not possible to reach a page without blocking, 638*4882a593Smuzhiyunfilesystem should skip it. Filesystem should use do_set_pte() to setup 639*4882a593Smuzhiyunpage table entry. Pointer to entry associated with the page is passed in 640*4882a593Smuzhiyun"pte" field in vm_fault structure. Pointers to entries for other offsets 641*4882a593Smuzhiyunshould be calculated relative to "pte". 642*4882a593Smuzhiyun 643*4882a593Smuzhiyun->page_mkwrite() is called when a previously read-only pte is 644*4882a593Smuzhiyunabout to become writeable. The filesystem again must ensure that there are 645*4882a593Smuzhiyunno truncate/invalidate races, and then return with the page locked. 
If 646*4882a593Smuzhiyunthe page has been truncated, the filesystem should not look up a new page 647*4882a593Smuzhiyunlike the ->fault() handler, but simply return with VM_FAULT_NOPAGE, which 648*4882a593Smuzhiyunwill cause the VM to retry the fault. 649*4882a593Smuzhiyun 650*4882a593Smuzhiyun->pfn_mkwrite() is the same as page_mkwrite but when the pte is 651*4882a593SmuzhiyunVM_PFNMAP or VM_MIXEDMAP with a page-less entry. Expected return is 652*4882a593SmuzhiyunVM_FAULT_NOPAGE. Or one of the VM_FAULT_ERROR types. The default behavior 653*4882a593Smuzhiyunafter this call is to make the pte read-write, unless pfn_mkwrite returns 654*4882a593Smuzhiyunan error. 655*4882a593Smuzhiyun 656*4882a593Smuzhiyun->access() is called when get_user_pages() fails in 657*4882a593Smuzhiyunaccess_process_vm(), typically used to debug a process through 658*4882a593Smuzhiyun/proc/pid/mem or ptrace. This function is needed only for 659*4882a593SmuzhiyunVM_IO | VM_PFNMAP VMAs. 660*4882a593Smuzhiyun 661*4882a593Smuzhiyun-------------------------------------------------------------------------------- 662*4882a593Smuzhiyun 663*4882a593Smuzhiyun Dubious stuff 664*4882a593Smuzhiyun 665*4882a593Smuzhiyun(if you break something or notice that it is broken and do not fix it yourself 666*4882a593Smuzhiyun- at least put it here) 667