/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
/*
 *  include/linux/userfaultfd.h
 *
 *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
 *  Copyright (C) 2015  Red Hat, Inc.
 *
 */

#ifndef _LINUX_USERFAULTFD_H
#define _LINUX_USERFAULTFD_H

#include <linux/types.h>

/*
 * If the UFFDIO_API is upgraded someday, the UFFDIO_UNREGISTER and
 * UFFDIO_WAKE ioctls should be defined as _IOW and not as _IOR.  In
 * userfaultfd.h we assumed the kernel was reading (instead _IOC_READ
 * means the userland is reading).
 */
#define UFFD_API ((__u64)0xAA)

/* Every register mode defined in struct uffdio_register below. */
#define UFFD_API_REGISTER_MODES (UFFDIO_REGISTER_MODE_MISSING |	\
				 UFFDIO_REGISTER_MODE_WP |	\
				 UFFDIO_REGISTER_MODE_MINOR)

/* Every UFFD_FEATURE_* flag defined in struct uffdio_api below. */
#define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP |	\
			   UFFD_FEATURE_EVENT_FORK |		\
			   UFFD_FEATURE_EVENT_REMAP |		\
			   UFFD_FEATURE_EVENT_REMOVE |		\
			   UFFD_FEATURE_EVENT_UNMAP |		\
			   UFFD_FEATURE_MISSING_HUGETLBFS |	\
			   UFFD_FEATURE_MISSING_SHMEM |		\
			   UFFD_FEATURE_SIGBUS |		\
			   UFFD_FEATURE_THREAD_ID |		\
			   UFFD_FEATURE_MINOR_HUGETLBFS |	\
			   UFFD_FEATURE_MINOR_SHMEM)

/*
 * Bitmasks of ioctl command numbers: bit N is set for the ioctl whose
 * _UFFDIO_* number (defined below) is N.
 */
#define UFFD_API_IOCTLS				\
	((__u64)1 << _UFFDIO_REGISTER |		\
	 (__u64)1 << _UFFDIO_UNREGISTER |	\
	 (__u64)1 << _UFFDIO_API)
#define UFFD_API_RANGE_IOCTLS			\
	((__u64)1 << _UFFDIO_WAKE |		\
	 (__u64)1 << _UFFDIO_COPY |		\
	 (__u64)1 << _UFFDIO_ZEROPAGE |		\
	 (__u64)1 << _UFFDIO_WRITEPROTECT |	\
	 (__u64)1 << _UFFDIO_CONTINUE)
#define UFFD_API_RANGE_IOCTLS_BASIC		\
	((__u64)1 << _UFFDIO_WAKE |		\
	 (__u64)1 << _UFFDIO_COPY |		\
	 (__u64)1 << _UFFDIO_CONTINUE)

/*
 * Valid ioctl command number range with this API is from 0x00 to
 * 0x3F.  UFFDIO_API is the fixed number, everything else can be
 * changed by implementing a different UFFD_API. If sticking to the
 * same UFFD_API more ioctl can be added and userland will be aware of
 * which ioctl the running kernel implements through the ioctl command
 * bitmask written by the UFFDIO_API.
 */
#define _UFFDIO_REGISTER		(0x00)
#define _UFFDIO_UNREGISTER		(0x01)
#define _UFFDIO_WAKE			(0x02)
#define _UFFDIO_COPY			(0x03)
#define _UFFDIO_ZEROPAGE		(0x04)
#define _UFFDIO_WRITEPROTECT		(0x06)
#define _UFFDIO_CONTINUE		(0x07)
#define _UFFDIO_API			(0x3F)

/* userfaultfd ioctl ids */
#define UFFDIO 0xAA
#define UFFDIO_API		_IOWR(UFFDIO, _UFFDIO_API,	\
				      struct uffdio_api)
#define UFFDIO_REGISTER		_IOWR(UFFDIO, _UFFDIO_REGISTER, \
				      struct uffdio_register)
#define UFFDIO_UNREGISTER	_IOR(UFFDIO, _UFFDIO_UNREGISTER,	\
				     struct uffdio_range)
#define UFFDIO_WAKE		_IOR(UFFDIO, _UFFDIO_WAKE,	\
				     struct uffdio_range)
#define UFFDIO_COPY		_IOWR(UFFDIO, _UFFDIO_COPY,	\
				      struct uffdio_copy)
#define UFFDIO_ZEROPAGE		_IOWR(UFFDIO, _UFFDIO_ZEROPAGE,	\
				      struct uffdio_zeropage)
#define UFFDIO_WRITEPROTECT	_IOWR(UFFDIO, _UFFDIO_WRITEPROTECT, \
				      struct uffdio_writeprotect)
#define UFFDIO_CONTINUE		_IOWR(UFFDIO, _UFFDIO_CONTINUE,	\
				      struct uffdio_continue)

/*
 * read() structure
 *
 * NOTE(review): __packed is not defined anywhere in this header;
 * presumably it is supplied or rewritten by the build / header-export
 * step -- confirm before consuming this file directly from userland.
 */
struct uffd_msg {
	__u8	event;		/* one of the UFFD_EVENT_* values below */

	__u8	reserved1;
	__u16	reserved2;
	__u32	reserved3;

	union {
		struct {
			__u64	flags;	/* UFFD_PAGEFAULT_FLAG_* bits */
			__u64	address;
			union {
				__u32 ptid;
			} feat;
		} pagefault;

		struct {
			__u32	ufd;
		} fork;

		struct {
			__u64	from;
			__u64	to;
			__u64	len;
		} remap;

		struct {
			__u64	start;
			__u64	end;
		} remove;

		struct {
			/* unused reserved fields */
			__u64	reserved1;
			__u64	reserved2;
			__u64	reserved3;
		} reserved;
	} arg;
} __packed;

/*
 * Start at 0x12 and not at 0 to be more strict against bugs.
 */
#define UFFD_EVENT_PAGEFAULT	0x12
#define UFFD_EVENT_FORK		0x13
#define UFFD_EVENT_REMAP	0x14
#define UFFD_EVENT_REMOVE	0x15
#define UFFD_EVENT_UNMAP	0x16

/* flags for UFFD_EVENT_PAGEFAULT */
#define UFFD_PAGEFAULT_FLAG_WRITE	(1<<0)	/* If this was a write fault */
#define UFFD_PAGEFAULT_FLAG_WP		(1<<1)	/* If reason is VM_UFFD_WP */
#define UFFD_PAGEFAULT_FLAG_MINOR	(1<<2)	/* If reason is VM_UFFD_MINOR */

struct uffdio_api {
	/* userland asks for an API number and the features to enable */
	__u64 api;
	/*
	 * Kernel answers below with all the available features for
	 * the API, this notifies userland of which events and/or
	 * which flags for each event are enabled in the current
	 * kernel.
	 *
	 * Note: UFFD_EVENT_PAGEFAULT and UFFD_PAGEFAULT_FLAG_WRITE
	 * are to be considered implicitly always enabled in all kernels as
	 * long as the uffdio_api.api requested matches UFFD_API.
	 *
	 * UFFD_FEATURE_MISSING_HUGETLBFS means an UFFDIO_REGISTER
	 * with UFFDIO_REGISTER_MODE_MISSING mode will succeed on
	 * hugetlbfs virtual memory ranges. Adding or not adding
	 * UFFD_FEATURE_MISSING_HUGETLBFS to uffdio_api.features has
	 * no real functional effect after UFFDIO_API returns, but
	 * it's only useful for an initial feature set probe at
	 * UFFDIO_API time. There are two ways to use it:
	 *
	 * 1) by adding UFFD_FEATURE_MISSING_HUGETLBFS to the
	 *    uffdio_api.features before calling UFFDIO_API, an error
	 *    will be returned by UFFDIO_API on a kernel without
	 *    hugetlbfs missing support
	 *
	 * 2) the UFFD_FEATURE_MISSING_HUGETLBFS can not be added in
	 *    uffdio_api.features and instead it will be set by the
	 *    kernel in the uffdio_api.features if the kernel supports
	 *    it, so userland can later check if the feature flag is
	 *    present in uffdio_api.features after UFFDIO_API
	 *    succeeded.
	 *
	 * UFFD_FEATURE_MISSING_SHMEM works the same as
	 * UFFD_FEATURE_MISSING_HUGETLBFS, but it applies to shmem
	 * (i.e. tmpfs and other shmem based APIs).
	 *
	 * UFFD_FEATURE_SIGBUS feature means no page-fault
	 * (UFFD_EVENT_PAGEFAULT) event will be delivered, instead
	 * a SIGBUS signal will be sent to the faulting process.
	 *
	 * UFFD_FEATURE_THREAD_ID pid of the page faulted task_struct will
	 * be returned, if feature is not requested 0 will be returned.
	 *
	 * UFFD_FEATURE_MINOR_HUGETLBFS indicates that minor faults
	 * can be intercepted (via REGISTER_MODE_MINOR) for
	 * hugetlbfs-backed pages.
	 *
	 * UFFD_FEATURE_MINOR_SHMEM indicates the same support as
	 * UFFD_FEATURE_MINOR_HUGETLBFS, but for shmem-backed pages instead.
	 */
#define UFFD_FEATURE_PAGEFAULT_FLAG_WP		(1<<0)
#define UFFD_FEATURE_EVENT_FORK			(1<<1)
#define UFFD_FEATURE_EVENT_REMAP		(1<<2)
#define UFFD_FEATURE_EVENT_REMOVE		(1<<3)
#define UFFD_FEATURE_MISSING_HUGETLBFS		(1<<4)
#define UFFD_FEATURE_MISSING_SHMEM		(1<<5)
#define UFFD_FEATURE_EVENT_UNMAP		(1<<6)
#define UFFD_FEATURE_SIGBUS			(1<<7)
#define UFFD_FEATURE_THREAD_ID			(1<<8)
#define UFFD_FEATURE_MINOR_HUGETLBFS		(1<<9)
#define UFFD_FEATURE_MINOR_SHMEM		(1<<10)
	__u64 features;

	__u64 ioctls;
};

struct uffdio_range {
	__u64 start;
	__u64 len;
};

struct uffdio_register {
	struct uffdio_range range;
#define UFFDIO_REGISTER_MODE_MISSING	((__u64)1<<0)
#define UFFDIO_REGISTER_MODE_WP		((__u64)1<<1)
#define UFFDIO_REGISTER_MODE_MINOR	((__u64)1<<2)
	__u64 mode;

	/*
	 * kernel answers which ioctl commands are available for the
	 * range, keep at the end as the last 8 bytes aren't read.
	 */
	__u64 ioctls;
};

struct uffdio_copy {
	__u64 dst;
	__u64 src;
	__u64 len;
#define UFFDIO_COPY_MODE_DONTWAKE		((__u64)1<<0)
	/*
	 * UFFDIO_COPY_MODE_WP will map the page write protected on
	 * the fly.  UFFDIO_COPY_MODE_WP is available only if the
	 * write protected ioctl is implemented for the range
	 * according to the uffdio_register.ioctls.
	 */
#define UFFDIO_COPY_MODE_WP			((__u64)1<<1)
	__u64 mode;

	/*
	 * "copy" is written by the ioctl and must be at the end: the
	 * copy_from_user will not read the last 8 bytes.
	 */
	__s64 copy;
};

struct uffdio_zeropage {
	struct uffdio_range range;
#define UFFDIO_ZEROPAGE_MODE_DONTWAKE		((__u64)1<<0)
	__u64 mode;

	/*
	 * "zeropage" is written by the ioctl and must be at the end:
	 * the copy_from_user will not read the last 8 bytes.
	 */
	__s64 zeropage;
};

struct uffdio_writeprotect {
	struct uffdio_range range;
/*
 * UFFDIO_WRITEPROTECT_MODE_WP: set the flag to write protect a range,
 * unset the flag to undo protection of a range which was previously
 * write protected.
 *
 * UFFDIO_WRITEPROTECT_MODE_DONTWAKE: set the flag to avoid waking up
 * any wait thread after the operation succeeds.
 *
 * NOTE: Write protecting a region (WP=1) is unrelated to page faults,
 * therefore DONTWAKE flag is meaningless with WP=1.  Removing write
 * protection (WP=0) in response to a page fault wakes the faulting
 * task unless DONTWAKE is set.
 */
#define UFFDIO_WRITEPROTECT_MODE_WP		((__u64)1<<0)
#define UFFDIO_WRITEPROTECT_MODE_DONTWAKE	((__u64)1<<1)
	__u64 mode;
};

struct uffdio_continue {
	struct uffdio_range range;
#define UFFDIO_CONTINUE_MODE_DONTWAKE		((__u64)1<<0)
	__u64 mode;

	/*
	 * Fields below here are written by the ioctl and must be at the end:
	 * the copy_from_user will not read past here.
	 */
	__s64 mapped;
};

/*
 * Flags for the userfaultfd(2) system call itself.
 */

/*
 * Create a userfaultfd that can handle page faults only in user mode.
 */
#define UFFD_USER_MODE_ONLY 1

#endif /* _LINUX_USERFAULTFD_H */